diff --git a/.settings/language.settings.xml b/.settings/language.settings.xml index 1df044e..888f6e2 100644 --- a/.settings/language.settings.xml +++ b/.settings/language.settings.xml @@ -5,7 +5,7 @@ - + diff --git a/.travis.yml b/.travis.yml index e915788..8953684 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,25 +38,7 @@ matrix: env: - MATRIX_EVAL="CC=gcc-6 && CXX=g++-6" compiler: gcc - - - os: osx - osx_image: xcode8 - env: - - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" - compiler: gcc - - - os: osx - osx_image: xcode8 - env: - - MATRIX_EVAL="brew install gcc5 && CC=gcc-5 && CXX=g++-5" - compiler: gcc - - - os: osx - osx_image: xcode8 - env: - - MATRIX_EVAL="brew install gcc && CC=gcc-6 && CXX=g++-6" - compiler: gcc - + # works on Precise and Trusty - os: linux addons: diff --git a/R/1kgp3_chr20_105_1.jpeg b/R/1kgp3_chr20_105_1.jpeg new file mode 100644 index 0000000..fbe9162 Binary files /dev/null and b/R/1kgp3_chr20_105_1.jpeg differ diff --git a/R/1kgp3_chr20_105_1_symmetric.jpeg b/R/1kgp3_chr20_105_1_symmetric.jpeg new file mode 100644 index 0000000..6aa970f Binary files /dev/null and b/R/1kgp3_chr20_105_1_symmetric.jpeg differ diff --git a/R/1kgp3_chr20_105_1_triangular.jpeg b/R/1kgp3_chr20_105_1_triangular.jpeg new file mode 100644 index 0000000..b757172 Binary files /dev/null and b/R/1kgp3_chr20_105_1_triangular.jpeg differ diff --git a/R/1kgp3_chr20_45_part1_10.jpeg b/R/1kgp3_chr20_45_part1_10.jpeg new file mode 100644 index 0000000..adcb513 Binary files /dev/null and b/R/1kgp3_chr20_45_part1_10.jpeg differ diff --git a/R/1kgp3_chr20_large_region.jpeg b/R/1kgp3_chr20_large_region.jpeg new file mode 100644 index 0000000..ac56b45 Binary files /dev/null and b/R/1kgp3_chr20_large_region.jpeg differ diff --git a/R/example_region.R b/R/example_region.R new file mode 100644 index 0000000..9f731b8 --- /dev/null +++ b/R/example_region.R @@ -0,0 +1,25 @@ +# Specify colour scheme +colors<-paste0(colorRampPalette(c("blue","red"))(10),seq(0,100,length.out = 11)) 
+colors[1]<-paste0(colors[1],"0") +colors[length(colors)]<- substr(colors[length(colors)],1,7) + +# Define support functions +plotLDRegion<-function(dataSource, from, to, ...){ + # B is A but sorted for plotting reasons (Z-stack) + b<-dataSource[dataSource$V3>=from & dataSource$V3 <= to & dataSource$V5 >= from & dataSource$V5 <= to,] + b<-b[order(b$V12,decreasing = F),] + plot(b$V3,b$V5,pch=20,cex=.1,col=colors[cut(b$V12,breaks=seq(0,1,length.out = 11),include.lowest = T)],xlim=c(from,to),ylim=c(from,to),xaxs="i",yaxs="i", ...) +} + +plotLDRegionTriangular<-function(dataSource, from, to, ...){ + # B is A but sorted for plotting reasons (Z-stack) + b<-dataSource[dataSource$V3>=from & dataSource$V5<=to & dataSource$V3>=from & dataSource$V5<=to,] + b<-b[b$V3sum(b)*0.8)[1],col="red",lwd=2) -abline(v=which(cumsum(b)>sum(b)*0.9)[1],col="pink",lwd=2) -abline(v=which(cumsum(b)>sum(b)*0.95)[1],col="yellow",lwd=2) diff --git a/R/plot_functions.R b/R/plot_functions.R deleted file mode 100644 index abab757..0000000 --- a/R/plot_functions.R +++ /dev/null @@ -1,156 +0,0 @@ -colors<-paste0(colorRampPalette(c("blue","red"))(10),seq(0,100,length.out = 11)) -colors[1]<-paste0(colors[1],"0") -colors[length(colors)]<- substr(colors[length(colors)],1,7) - -occurences<-sort(table(ld$V4),decreasing = T) -multiples<-as.numeric(names(occurences[occurences>5])) -#Decay -decay<-function(pos, ...){ - par(mar=c(2,2,2,2)) - plot(ld[ld$V4==multiples[pos],6] - ld[ld$V4==multiples[pos],4],ld[ld$V4==multiples[pos],13],pch=20,ylim=c(0,1),cex=(-10*(log10(.5)+log10(.5)))/ld[ld$V4==multiples[pos],2], ...) 
-} -decay(1) - -##test -testModel<-function(pos){ - dat<-data.frame("y"=a[a$V3==multiples[pos],12], - "x"=a[a$V3==multiples[pos],5]) - dat$x<-dat$x-min(dat$x)+1 - - #plot(dat$x,dat$y) - mod <- nls(y ~ a*x^(-a*b), data = dat, start = list(a = 1, b = 0.15),algorithm="port",weights = a[a$V3==multiples[pos],12]) - # plot decay - modelRatio<-(coef(mod)["a"]*dat$x^(-coef(mod)["a"]*coef(mod)["b"]))/mean(dat$y) - - par(mfrow=c(2,1)) - decay(pos,col=c("blue","red")[as.factor(modelRatio>2)]) - lines(a[a$V3==multiples[pos],5],coef(mod)["a"]*1/dat$x^coef(mod)["b"],lwd=2,col="red") - abline(h=mean(dat$y),lwd=2,col="blue",lty="dashed") - plot(modelRatio,type="l") -} - -plot(a$V3,a$V5,pch=20,cex=.5,col=colors[cut(a$V12,breaks=seq(0,1,length.out = 11),include.lowest = T)],xlim=c(750e3,950e3),ylim=c(750e3,950e3)) - -# -estimator<-function(pos){ - a1<- 1 - pnorm((a[a$V3==multiples[pos],12]-mean(a[a$V3==multiples[pos],12]))/sd(a[a$V3==multiples[pos],12]),lower.tail = F) - b1<- -log10(a[a$V3==multiples[pos],13]) - #plot(a1) - composite<- a1 * a[a$V3==multiples[pos],12] - plot(composite,ylim=c(0,1)) - return(composite) -} -x<-seq(mean(a[a$V3==multiples[1],12])-3*sd(a[a$V3==multiples[1],12]),mean(a[a$V3==multiples[1],12])+3*sd(a[a$V3==multiples[1],12]),length=1000) -plot(x,dnorm(x,mean(a[a$V3==multiples[1],12]),sd(a[a$V3==multiples[1],12])),type="l") - -gtf<-read.delim("~/Documents/Homo_sapiens.GRCh37.75_main.txt",head=F) - -from<-min(ld$V4[ld$V3==0&ld$V5==0]) -to<-max(ld$V6[ld$V3==0&ld$V5==0]) - -plotLDRegion<-function(dataSource, from, to, ...){ - #temp<-gtf[(gtf$V4<=from>f$V5>from)|(gtf$V4>from>f$V4=from>f$V5<=to)|(gtf$V5>from>f$V5<=to),] - #temp<-temp[temp$V2=="protein_coding",] - #layout(mat = c(1,2,3),heights = c(1,2,8)) - #par(mar=c(0,3,3,3)) - #plot(-1,-1,ylim=c(0,1),xlim=c(min(temp$V4),max(temp$V5)),xaxt="n",xaxs="i") - #rect(temp$V4,0.1,temp$V5,0.9,col="black") - #par(mar=c(0,3,0,3)) - 
#plot(seq(from,to,by=1000)[-1],table(cut(dataSource[dataSource$V4>=from&dataSource$V6<=to,3],seq(from,to,by=1000))),pch=20,cex=1,xaxt="n",xaxs="i") - #par(mar=c(3,3,0,3)) - # B is A but sorted for plotting reasons (Z-stack) - b<-dataSource[dataSource$V2<30,] - b<-b[order(b$V13,decreasing = F),] - plot(b$V4,b$V6,pch=20,cex=.2,col=colors[cut(b$V13,breaks=seq(0,1,length.out = 11),include.lowest = T)],xlim=c(from,to),ylim=c(from,to),xaxs="i",yaxs="i", ...) -} - -for(i in 1:49){ - from=(i-1)*5e6; - to=(i*5e6); - filename = sprintf("~/Desktop/chr1_slices/metabric/chr1_block%i.jpeg",i) - jpeg(filename = filename,width = 2250,height = 1500, pointsize = 10,units = "px") - #temp<-gtf[gtf$V1==1&((gtf$V4<=from>f$V5>from)|(gtf$V4>from>f$V4=from>f$V5<=to)|(gtf$V5>from>f$V5<=to)),] - #temp<-temp[temp$V2=="protein_coding"&temp$V3=="gene",] - #layout(mat = c(1,2,3),heights = c(1,2,8)) - #par(mar=c(0,3,3,3)) - #plot(-1,-1,ylim=c(0,1),xlim=c(from, to),xaxt="n",xaxs="i") - #if(nrow(temp) != 0) - # rect(temp$V4,0.1,temp$V5,0.9,col="black") - #par(mar=c(0,3,0,3)) - #plot(seq(from,to,by=1000)[-1],table(cut(ld[ld$V4>=from&ld$V4<=to&ld$V6>=from&ld$V6<=to,4],seq(from,to,by=1000))),pch=20,cex=1,xaxt="n",xaxs="i") - - #layout(mat = c(1,2),heights = c(1,4)) - #par(mar=c(0,3,1,3)) - plot.new() - pushViewport(viewport(y = 0.6, height = 0.2, width = 0.8, angle=135)) - plotRecomb(recomb, "chr1", from, to, xaxt='n', las=2, newpage=FALSE) - #par(mar=c(0,3,0,3)) - #segRegion<-seg[seg$Chromosome==1&((seg$Start>=from&seg$Start<=to)|(seg$End>=from&seg$End<=to)),] - #segRegion<-segRegion[abs(segRegion$Segment_Mean)>0.5,] - recombTemp<-recomb[recomb$V3>=from&recomb$V4<=to&recomb$V2=="chr1",] - #plot(-1,-1,xlim=c(from,to),ylim=c(min(segRegion$Segment_Mean),max(segRegion$Segment_Mean)),xaxs="i") - #rect(from, -0.5, to, 0.5, col="lightgrey",border=NA) - #points(segRegion$Start,segRegion$Segment_Mean,pch=20) - #points(segRegion$End,segRegion$Segment_Mean,pch=17) - 
#abline(v=recomb$V3[recomb$V2=="chr1"],col="grey",lty="dashed") - par(mar=c(3,3,0,3)) - internal <- ld[ld$V4>=from-(to-from)&ld$V4<=to&ld$V6>=from&ld$V6<=to+(to-from),c(4,6,13)] - plot(internal$V4+internal$V6,internal$V6-internal$V4,pch=20,cex=.25,col=colors[cut(internal$V13,breaks=seq(0,1,length.out = 11),include.lowest = T)],xlim=c(from*2,to*2),ylim=c(0, 500e3),xaxs="i",yaxs="i",las=2) - abline(v=recomb$V3[recomb$V2=="chr1"],col="grey",lty="dashed") - dev.off() -} - -plot(unlist(lapply(split(test,test$V3),function(a)sum(a$V12>0.5))),pch=20,type="l") - -scaleFunction<-function(distance){ - if(distance > 50e3) - return(0) - return(1/50e3*(50e3-distance)) -} - -scoreFunction<-function(R2, startPos, endPosVector){ - vec <- rep(0, length(R2)) - M <- 0.6 - vec[1] = scaleFunction(endPosVector[1] - startPos) * R2[1] - M - if(length(R2) == 1) - return(vec) - - for(i in 2:length(R2)){ - vec[i] = vec[i-1] + (scaleFunction(endPosVector[i] - endPosVector[i-1]) * R2[i] - M) - } - return(vec) -} - -kernel<-function(vector, bandwidth = 5){ - normaliser<-2*sum(3/4*(1-((0:(bandwidth-1))/bandwidth)^2)) - - ret<-rep(0,length(vector)) - for(i in (bandwidth+2):(length(vector)-bandwidth-1)){ - - curSum = 0 - offset = bandwidth - for(j in (i-(bandwidth+1)):(i-1)){ - curSum = curSum + vector[j]*(3/4*(1-((offset-1)/bandwidth)^2))/normaliser - offset = offset - 1; - } - curSum = curSum + vector[i] - offset = 1; - for(j in (i+1):(i+(bandwidth+1))){ - curSum = curSum + vector[j]*(3/4*(1-((offset-1)/bandwidth)^2))/normaliser - offset = offset + 1 - } - #cat(i,"/",length(ret),": ", ret[i]) - #cat(curSum) - - ret[i] = curSum - } - return(ret) -} - -plotRecomb<-function(data,chr,from,to,...){ - dat<-data[data$V2==chr,] - plot(-1,-1,xlim=c(from,to),ylim=c(0,100), xaxs="i",yaxs="i", ...) 
- for(i in 1:nrow(dat)){ - rect(dat$V3[i],0,dat$V4[i],dat$V5[i],col = c("black","grey")[as.factor(dat$gender)]) - } -} \ No newline at end of file diff --git a/README.md b/README.md index 085b759..52bac6e 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,32 @@ -[![Build Status](https://travis-ci.org/mklarqvist/Tomahawk.svg?branch=master)](https://travis-ci.org/mklarqvist/Tomahawk) -[![Release](https://img.shields.io/badge/Release-beta_0.1-blue.svg)](https://github.com/mklarqvist/Tomahawk/releases) +[![Build Status](https://travis-ci.org/mklarqvist/tomahawk.svg?branch=master)](https://travis-ci.org/mklarqvist/tomahawk) +[![Release](https://img.shields.io/badge/Release-beta_0.3-blue.svg)](https://github.com/mklarqvist/tomahawk/releases) [![License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) ![screenshot](tomahawk.png) ## Fast calculation of LD in large-scale cohorts -Tomahawk efficiently represents genotypic data by exploiting basic genetic properties and we directly query this compressed representation to calculate linkage disequilibrium for all pairwise alleles/genotypes in large-scale cohorts. In order to achieve speed, Tomahawk combines primarily two efficient algorithms exploiting different concepts: 1) low genetic diversity, and 2) the large memory registers on modern processors. The first algorithm directly compares run-length encoded representation of genotypes from two vectors. The other precomputes the run-length encodings as 1-bit encodings and use SIMD-instructions to directly compare two bit-vectors. This algorithm also exploits the relatively low genetic diversity within species. Both algorithms are embarrassingly parallel. +Tomahawk efficiently compresses genotypic data by exploiting intrinsic genetic properties and we describe algorithms to directly query, manipulate, and explore this jointly compressed representation in-place. 
We represent genotypic vectors as fixed-width run-length encoded (RLE) objects with the five highest bits encoding for phasing, allele A, allele B, and the remainder as the run-length. This encoding scheme is superior to dynamic-width encoding approaches in terms of iteration speed but inferior in terms of compressibility. The word size (`uint8_t`, `uint16_t`, `uint32_t`, or `uint64_t`) of RLE entries is fixed across a file and is determined contextually contingent on the number of samples. Tomahawk has two primary internal functions: -The current format specifications (v.0) for `TWK`,`TWI`,`TWO`,`TOI`, and `TGZF` +1) iterating over sites and RLE entries; +2) computing the inner product of compressed genotypic vectors; + +We describe efficient algorithms to calculate genome-wide linkage disequilibrium for all pairwise alleles/genotypes in large-scale cohorts. In order to achieve speed, Tomahawk primarily combines two efficient algorithms exploiting different concepts: 1) low genetic diversity and 2) the large memory registers on modern processors. The first algorithm directly compares RLE entries from two vectors. The other transforms RLE entries to bit-vectors and uses SIMD-instructions to directly compare two such bit-vectors. This second algorithm also exploits the relatively low genetic diversity within species using implicit heuristics. Both algorithms are embarrassingly parallel. + +The current format specifications (v.0) for `TWK`,`TWO`, and `TGZF` are available [TWKv0](spec/TWKv0.pdf) -Marcus D. R. Klarqvist () +### Author +Marcus D. R. Klarqvist () +Department of Genetics, University of Cambridge +Wellcome Trust Sanger Institute + ### Installation instructions For modern x86-64 CPUs with `SSE4.2` or later, just type `make` in the `build` directory. If you see compilation errors, you most likely do not have `SSE4.2`. At the present time, we do not support non-x86 CPUs or old CPU architecture. 
```bash -git clone --recursive https://github.com/mklarqvist/Tomahawk -cd Tomahawk +git clone --recursive https://github.com/mklarqvist/tomahawk +cd tomahawk cd build make ``` @@ -31,8 +40,6 @@ compiled target. ### Brief usage instructions Tomahawk comprises five primary commands: `import`, `calc`, `view`, `sort`, and `concat`. -The function `stats` have partial support: currently limited to basics for `two` files. -The function `index` is disabled at the moment. Executing `tomahawk` gives a list of commands with brief descriptions and `tomahawk ` gives detailed details for that command. @@ -54,14 +61,11 @@ from Hardy-Weinberg equilibrium with a probability < 0.001 tomahawk import -i file.vcf -o outPrefix -m 0.2 -H 1e-3 ``` -### Import-extend -If you have split up your `vcf`/`bcf` files into multiple disjoint files -(such as one per chromosome) it is possible to iteratively import and extend a `twk` file: -```bash -tomahawk import -i file.bcf -e extend.twk -m 0.2 -H 1e-3 -``` - -### Calculating linkage disequilibrium +### Calculating all-vs-all linkage disequilibrium +In this example we force computations to use phased math (`-p`) and show a live progressbar +(`-d`). Generated data is filtered for minimum genotype frequency (`-a`), squared correlation +coefficient (`-r`) and by test-statistic P-value (`-P`). Total computation is partitioned into 990 pseudo-balanced blocks (`-c`) +and we select the first partition (`-C`) to compute using 28 threads (`-t`): ```bash tomahawk calc -pdi file.twk -o output_prefix -a 5 -r 0.1 -P 0.1 -c 990 -C 1 -t 28 ``` @@ -98,5 +102,70 @@ Perform k-way merge of partially sorted blocks tomahawk sort -i partial.two -o sorted.two -M ``` -### License +## Plotting +Plot `two` data converted into `ld` format using the supplied `R` scripts (in the `R` directory). 
+First transform a `two` file into human-readable `ld` format: +```bash +tomahawk view -hi 1kgp3_chr2_105_1.two > 1kgp3_chr2_105_1.ld +``` + +Either `source` the [R/example_region.R](R/example_region.R) file or copy-paste this code into `R`: +```R +# Specify colour scheme +colors<-paste0(colorRampPalette(c("blue","red"))(10),seq(0,100,length.out = 11)) +colors[1]<-paste0(colors[1],"0") +colors[length(colors)]<- substr(colors[length(colors)],1,7) + +# Define support functions +plotLDRegion<-function(dataSource, from, to, ...){ + # Assumes all the data is from the same chromosome + b<-dataSource[dataSource$V3>=from & dataSource$V3 <= to & dataSource$V5 >= from & dataSource$V5 <= to,] + b<-b[order(b$V13,decreasing = F),] # sort for Z-stack + plot(b$V3,b$V5,pch=20,cex=.2,col=colors[cut(b$V13,breaks=seq(0,1,length.out = 11),include.lowest = T)],xlim=c(from,to),ylim=c(from,to),xaxs="i",yaxs="i", ...) +} + +plotLDRegionTriangular<-function(dataSource, from, to, ...){ + # Assumes all the data is from the same chromosome + b<-dataSource[dataSource$V3>=from & dataSource$V5<=to & dataSource$V3>=from & dataSource$V5<=to,] + b<-b[b$V3blocks[this->selected_chunk]; - //std::cerr << Helpers::timestamp("DEBUG", "BALANCER") << this->selected_chunk << '/' << this->blocks.size() << std::endl; - - // attempt to merge - // If there are both equal - if(selected.fromRow == selected.fromColumn && selected.toRow == selected.toColumn){ - this->data_to_load.push_back(std::pair(selected.fromRow, selected.toRow)); - //std::cerr << "same: " << selected << std::endl; - } else { - // No voerlap - //std::cerr << "Not same: " << selected << std::endl; - this->data_to_load.push_back(std::pair(selected.fromRow, selected.toRow)); - this->data_to_load.push_back(std::pair(selected.fromColumn, selected.toColumn)); - } - - return true; - } - - bool getSelectedLoadThreads(const U32 threads){ - const block_type& selected = this->blocks[selected_chunk]; - //std::cerr << Helpers::timestamp("DEBUG", "BALANCER") 
<< "Thread balancing..." << std::endl; - - this->thread_distribution.resize(threads); - - if(threads == 1){ - this->thread_distribution[0].push_back(block_type(0, selected.getRows(), 0, selected.getColumns(), selected.fromRow, selected.toRow, selected.fromColumn, selected.toColumn, selected.isDiagonal())); - return true; - } - - // - if(selected.isDiagonal()){ - if(!SILENT){ - std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Case is diagonal (chunk " << this->selected_chunk << '/' << this->desired_chunks << ")..." << std::endl; - std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Total comparisons: " << Helpers::ToPrettyString(selected.getSize()) << " and per thread: " << Helpers::ToPrettyString(selected.getSize()/threads) << std::endl; - } - - U32 loadThread = selected.getSize()/threads; - U32 it = 0; - U32 from = 0; - U32 fromCol = 0; - U32 threadID = 0; - - // - for(U32 i = 0; i < selected.getRows(); ++i){ - for(U32 j = i; j < selected.getColumns(); ++j){ - ++it; - - // If number of comparions over threshold - if(it >= loadThread){ - // if broken over a line - // i.e. 
not broken on the same line number - if(from == i){ - //std::cerr << "B\t" << threadID << ": " << from << '-' << i+1 << '\t' << fromCol << '-' << j << '\t' << selected.fromRow+from << '-' << selected.fromRow+(i+1) << '\t' << selected.fromColumn+fromCol << '-' << selected.toColumn+j << std::endl; - this->thread_distribution[threadID].push_back(block_type(from, i+1, fromCol, j, selected.fromRow+from, selected.fromRow+i+1, selected.fromColumn+fromCol, selected.fromColumn+j)); - } - // If broken over multiple lines - else { - if(threadID + 1 == threads){ - i = selected.getRows() - 1; - j = selected.getColumns(); - } - - // If next line: no middle full lines - if(from + 1 == i){ - //std::cerr << "N\t" << threadID << ": " << from << '-' << from+1 << '\t' << fromCol << '-' << selected.getColumns() << '\t' << "FALSE" << std::endl; - //std::cerr << "N\t" << threadID << ": " << i << '-' << i+1 << '\t' << i << '-' << j << '\t' << "FALSE" << std::endl; - this->thread_distribution[threadID].push_back(block_type(from, from+1, fromCol, selected.getColumns(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.toColumn)); - this->thread_distribution[threadID].push_back(block_type(i, i+1, i, j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+i, selected.fromColumn+j)); - fromCol = j; - from = i; - } else { - //std::cerr << "E\t" << threadID << ": " << from << '-' << from + 1 << '\t' << fromCol << '-' << selected.getColumns() << '\t' << selected.fromRow+from << '-' << selected.fromRow+(from+1) << '\t' << selected.fromColumn+fromCol << '-' << selected.toColumn << std::endl; - //std::cerr << "E\t" << threadID << ": " << from + 1 << '-' << i << '\t' << from + 1 << '-' << selected.getColumns() << '\t' << selected.fromRow+from+1 << '-' << selected.fromRow+(i) << '\t' << selected.fromColumn+from+1 << '-' << selected.toColumn << std::endl; - //std::cerr << "E\t" << threadID << ": " << i << '-' << i + 1 << '\t' << i << '-' << j << '\t' << 
selected.fromRow+i << '-' << selected.fromRow+(i+1) << '\t' << selected.fromColumn+i << '-' << selected.fromColumn+j << std::endl; - this->thread_distribution[threadID].push_back(block_type(from, from + 1, fromCol, selected.getColumns(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.toColumn)); - this->thread_distribution[threadID].push_back(block_type(from + 1, i, from + 1, selected.getColumns(), selected.fromRow+from+1, selected.fromRow+i, selected.fromColumn+from+1, selected.toColumn, true)); - this->thread_distribution[threadID].push_back(block_type(i, i + 1, i, j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+i, selected.fromColumn+j)); - } - } - it = 0; - from = i; - fromCol = j; - ++threadID; - } - - } - } - } - // Is not a diagonal square - else { - if(!SILENT){ - std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Case is square (chunk " << this->selected_chunk << '/' << this->desired_chunks << ")..." << std::endl; - std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Total comparisons: " << Helpers::ToPrettyString(selected.getSize()) << " and per thread: " << Helpers::ToPrettyString(selected.getSize()/threads) << std::endl; - } - - U32 loadThread = selected.getSize()/threads; - U32 it = 0; - U32 from = 0; - U32 fromCol = selected.getRows(); - U32 threadID = 0; - - // - for(U32 i = 0; i < selected.getRows(); ++i){ - for(U32 j = selected.getRows(); j < 2*selected.getRows(); ++j){ - ++it; - - // If number of comparions over threshold - if(it >= loadThread){ - // if broken over a line - // i.e. 
not broken on the same line number - if(from == i){ - //std::cerr << threadID << ": " << from << '-' << i+1 << '\t' << fromCol << '-' << j << '\t' << "FALSE" << std::endl; - this->thread_distribution[threadID].push_back(block_type(from, i+1, fromCol, j, selected.fromRow+from, selected.fromRow+i+1, selected.fromColumn+fromCol, selected.fromColumn+j)); - } - // If broken over multiple lines - else { - if(threadID + 1 == threads){ - i = selected.getRows() - 1; - j = 2*selected.getRows(); - } - - // If next line: no middle full lines - if(from + 1 == i){ - //std::cerr << threadID << ": " << from << '-' << from+1 << '\t' << fromCol << '-' << 2*selected.getRows() << '\t' << "FALSE" << std::endl; - //std::cerr << threadID << ": " << i << '-' << i+1 << '\t' << selected.getRows() << '-' << j << '\t' << "FALSE" << std::endl; - this->thread_distribution[threadID].push_back(block_type(from, from+1, fromCol, 2*selected.getRows(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.fromColumn+2*selected.getRows())); - this->thread_distribution[threadID].push_back(block_type(i, i+1, 2*selected.getRows(), j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+2*selected.getRows(), selected.fromColumn+j)); - fromCol = j; - from = i; - } else { - //std::cerr << threadID << ": " << from << '-' << from + 1 << '\t' << fromCol << '-' << 2*selected.getRows() << '\t' << "FALSE" << std::endl; - //std::cerr << threadID << ": " << from + 1 << '-' << i << '\t' << selected.getRows() << '-' << 2*selected.getRows() << '\t' << "FALSE" << std::endl; - //std::cerr << threadID << ": " << i << '-' << i + 1 << '\t' << selected.getRows() << '-' << j << '\t' << "FALSE" << std::endl; - this->thread_distribution[threadID].push_back(block_type(from, from + 1, fromCol, 2*selected.getRows(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.fromColumn+2*selected.getRows())); - 
this->thread_distribution[threadID].push_back(block_type(from + 1, i, selected.getRows(), 2*selected.getRows(), selected.fromRow+from+1, selected.fromRow+i, selected.fromColumn+selected.getRows(), selected.fromColumn+2*selected.getRows())); - this->thread_distribution[threadID].push_back(block_type(i, i + 1, selected.getRows(), j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+selected.getRows(), selected.fromColumn+j)); - } - } - it = 0; - from = i; - fromCol = j; - ++threadID; - } - - } - } - } - - // assertion - - - //std::cerr << "DEBUG" << std::endl; - //for(U32 i = 0; i < this->thread_distribution.size(); ++i) - // std::cerr << i << '\t' << this->thread_distribution[i].size() << std::endl; - //std::cerr << "Has: " << this->thread_distribution.size() << " thread blocks" << std::endl; - - return true; - } - - bool setSelected(const S32 selected){ - if(selected < 0){ - std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Cannot set select a negative chunk..." << std::endl; - return false; - } - - this->selected_chunk = selected; - return true; - } - - bool setDesired(const S32 desired){ - if(desired < 0){ - std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Cannot cut workload into a negative number of blocks..." << std::endl; - return false; - } - - this->desired_chunks = desired; - return true; - } - - bool Build(const U32 total_blocks, const U32 threads){ - if(this->selected_chunk > this->desired_chunks){ - std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Incorrectly selected block (" << this->selected_chunk << '/' << this->desired_chunks << ")..." 
<< std::endl; - return false; - } - - // If selecting > 1 chunk - if(this->desired_chunks != 1){ - U32 cutSize = 1; - //std::vector backup_cuts; - for(U32 i = 1; i < total_blocks; ++i){ - - if((i*i - i) / 2 == this->desired_chunks) - cutSize = i; - - } - - if(cutSize == 1){ - std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Cannot cut into " << this->desired_chunks << " chunks" << std::endl; - return(false); - } - - U32 total = 0; // Sanity - //std::cerr << "cut-size is: " << cutSize << std::endl; - const U32 rowLength = total_blocks / cutSize; - for(U32 i = 0; i < cutSize-1; ++i){ - //std::cerr << i << '/' << cutSize-1 << std::endl; - U32 j = i; - U32 fromX = i*rowLength; - U32 toX = (i+1)*rowLength; - if(i + 1 == cutSize - 1) - toX = total_blocks; - - for(; j < cutSize-1; ++j){ - U32 fromY = j*rowLength; - U32 toY = (j+1)*rowLength; - - - if(j + 1 == cutSize - 1) - toY = total_blocks; - - //std::cerr << "(" << i << ',' << j << ")\t" << fromX << '-' << toX << '\t' << fromY << '-' << toY << std::endl; - this->blocks.push_back(block_type(fromX, toX, fromY, toY)); - ++total; - } - } - - //std::cerr << "Total: " << total << '/' << this->desired_chunks << std::endl; - if(total != this->desired_chunks){ - std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Corrupted balancing..." << std::endl; - return(false); - } - - } else { - // All blocks - this->blocks.push_back(block_type(0, total_blocks, 0, total_blocks)); - } - - // What data do we load? 
- this->getSelectedLoad(); - - // Divide data into threads - if(!this->getSelectedLoadThreads(threads)) - return false; - - return true; - } - - inline std::vector< std::pair >& getLoad(void){ return(this->data_to_load); } - -public: - U32 selected_chunk; - U32 desired_chunks; - - std::vector blocks; - std::vector< std::pair > data_to_load; - std::vector< std::vector > thread_distribution; -}; - -} -#endif /* ALGORITHM_BALANCER_H_ */ diff --git a/src/algorithm/compression/RunLengthEncoding.h b/src/algorithm/compression/RunLengthEncoding.h deleted file mode 100644 index 3331baf..0000000 --- a/src/algorithm/compression/RunLengthEncoding.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef ALGORITHM_COMPRESSION_RUNLENGTHENCODING_H_ -#define ALGORITHM_COMPRESSION_RUNLENGTHENCODING_H_ - -namespace Tomahawk{ -namespace Algorithm{ - -template -inline int PACK3(const BYTE& ref, char* target, T length){ - if(length <= 7){ - *target++ = 0 | ((ref & 15) << 3) | (length & 7); // highest bit is 0 - return 1; - } - - char* target0 = target; - *target++ = 128 | ((ref & 15) << 3) | (length & 7); - length >>= 3; - - while(true){ - if(length <= 7){ - *target++ = 0 | (length & 127); // highest bit is 0 - length >>= 7; - break; - } - - *target++ = 128 | (length & 127); - length >>= 7; - } - return(target - target0); -} - -template -inline int UNPACK3(char* target, T length, BYTE& ref){ - if((*target & 128) == 0){ - length = *target & 7; - ref = *target >> 3; - return 1; - } - - char* target0 = target; - length = *target & 7; - ref = (*target >> 3) & 15; - ++target; - U32 offset = 3; - - while(true){ - length |= (*target & 127) << offset; - offset += 7; - - if((*target & 128) == 0) break; - ++target; - } - - return(target - target0); -} - -} -} - -#endif /* ALGORITHM_COMPRESSION_RUNLENGTHENCODING_H_ */ diff --git a/src/algorithm/compression/TomahawkImportRLE.h b/src/algorithm/compression/genotype_encoder.h similarity index 92% rename from src/algorithm/compression/TomahawkImportRLE.h rename to 
src/algorithm/compression/genotype_encoder.h index 66a39c4..5807295 100644 --- a/src/algorithm/compression/TomahawkImportRLE.h +++ b/src/algorithm/compression/genotype_encoder.h @@ -4,9 +4,9 @@ #include #include -#include "../../math/FisherMath.h" +#include "../../math/fisher_math.h" #include "../../io/bcf/BCFReader.h" -#include "RunLengthEncoding.h" +#include "../../io/vcf/VCFLines.h" namespace Tomahawk{ namespace Algorithm{ @@ -181,26 +181,26 @@ struct TomahawkImportRLEHelper{ inline const U64 countAlleles(void) const{ return(this->countsAlleles[0] + this->countsAlleles[1] + this->countsAlleles[2]); } - U64 countsGenotypes[16]; - U64 countsAlleles[3]; + U64 countsGenotypes[16]; + U64 countsAlleles[3]; float MAF; float MGF; float HWE_P; - bool missingValues; - bool phased; - const U64 expectedSamples; + bool missingValues; + bool phased; + const U64 expectedSamples; FisherMath fisherTable; }; -class TomahawkImportRLE { - typedef TomahawkImportRLE self_type; - typedef bool (Tomahawk::Algorithm::TomahawkImportRLE::*rleFunction)(const VCF::VCFLine& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs); // Type cast pointer to function - typedef bool (Tomahawk::Algorithm::TomahawkImportRLE::*bcfFunction)(const BCF::BCFEntry& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs); // Type cast pointer to function +class GenotypeEncoder { + typedef GenotypeEncoder self_type; + typedef bool (self_type::*rleFunction)(const VCF::VCFLine& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs); // Type cast pointer to function + typedef bool (self_type::*bcfFunction)(const BCF::BCFEntry& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs); // Type cast pointer to function typedef TomahawkImportRLEHelper helper_type; public: - TomahawkImportRLE(const U64 samples) : + GenotypeEncoder(const U64 samples) : n_samples(samples), encode(nullptr), encodeComplex(nullptr), @@ -212,7 +212,7 @@ class TomahawkImportRLE { { } - ~TomahawkImportRLE(){ + ~GenotypeEncoder(){ } void 
DetermineBitWidth(void){ @@ -284,12 +284,12 @@ class TomahawkImportRLE { template bool RunLengthEncodeBCF(const BCF::BCFEntry& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs); private: - U64 n_samples; - rleFunction encode; // encoding function - rleFunction encodeComplex; // encoding function + U64 n_samples; + rleFunction encode; // encoding function + rleFunction encodeComplex; // encoding function bcfFunction encodeBCF; - BYTE bit_width; - BYTE shiftSize; // bit shift size + BYTE bit_width; + BYTE shiftSize; // bit shift size helper_type helper; public: @@ -297,7 +297,7 @@ class TomahawkImportRLE { }; template -bool TomahawkImportRLE::RunLengthEncodeBCF(const BCF::BCFEntry& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs){ +bool GenotypeEncoder::RunLengthEncodeBCF(const BCF::BCFEntry& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs){ //std::cerr << meta.size() << '\t' << runs.size(); meta += (U32)line.body->POS + 1; @@ -360,12 +360,13 @@ bool TomahawkImportRLE::RunLengthEncodeBCF(const BCF::BCFEntry& line, IO::BasicB this->helper.calculateHardyWeinberg(); // Position - U32& position = *reinterpret_cast(&meta[meta.pointer - 5]); + U32& position = *reinterpret_cast(&meta[meta.size() - 5]); position <<= 2; position |= this->helper.phased << 1; position |= this->helper.missingValues << 0; meta += this->helper.MGF; meta += this->helper.HWE_P; + //std::cerr << this->helper.MGF << std::endl; //n_runs = runs.pointer - runs_pointer_begin; // temp meta += n_runs; @@ -374,16 +375,16 @@ bool TomahawkImportRLE::RunLengthEncodeBCF(const BCF::BCFEntry& line, IO::BasicB } template -bool TomahawkImportRLE::RunLengthEncodeSimple(const VCF::VCFLine& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs){ +bool GenotypeEncoder::RunLengthEncodeSimple(const VCF::VCFLine& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs){ meta += line.position; meta += line.ref_alt; - /////////////////////////////// + /*////////////////////////////// // Encoding: // First 8|T| - 
TOMAHAWK_SNP_PACK_WIDTH bits encode the run length // remaining TOMAHAWK_SNP_PACK_WIDTH bits encode // TOMAHAWK_ALLELE_PACK_WIDTH bits of snpA and TOMAHAWK_ALLELE_PACK_WIDTH bits of snpB - /////////////////////////////// + //////////////////////////////*/ T run_length = 1; // ASCII value for '.' is 46 @@ -466,7 +467,7 @@ bool TomahawkImportRLE::RunLengthEncodeSimple(const VCF::VCFLine& line, IO::Basi this->helper.calculateHardyWeinberg(); // Position - U32& position = *reinterpret_cast(&meta[meta.pointer - 5]); + U32& position = *reinterpret_cast(&meta[meta.size() - 5]); position <<= 2; position |= this->helper.phased << 1; position |= this->helper.missingValues << 0; @@ -480,16 +481,16 @@ bool TomahawkImportRLE::RunLengthEncodeSimple(const VCF::VCFLine& line, IO::Basi template -bool TomahawkImportRLE::RunLengthEncodeComplex(const VCF::VCFLine& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs){ +bool GenotypeEncoder::RunLengthEncodeComplex(const VCF::VCFLine& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs){ meta += line.position; meta += line.ref_alt; - /////////////////////////////// + /*////////////////////////////// // Encoding: // First 8|T| - TOMAHAWK_SNP_PACK_WIDTH bits encode the run length // remaining TOMAHAWK_SNP_PACK_WIDTH bits encode // TOMAHAWK_ALLELE_PACK_WIDTH bits of snpA and TOMAHAWK_ALLELE_PACK_WIDTH bits of snpB - /////////////////////////////// + //////////////////////////////*/ T run_length = 1; // ASCII value for '.' 
is 46 @@ -570,7 +571,7 @@ bool TomahawkImportRLE::RunLengthEncodeComplex(const VCF::VCFLine& line, IO::Bas this->helper.calculateHardyWeinberg(); // Position - U32& position = *reinterpret_cast(&meta[meta.pointer - 5]); + U32& position = *reinterpret_cast(&meta[meta.size() - 5]); position <<= 2; position |= this->helper.phased << 1; position |= this->helper.missingValues << 0; diff --git a/src/algorithm/GenotypeBitPacker.h b/src/algorithm/genotype_bitpacker.h similarity index 95% rename from src/algorithm/GenotypeBitPacker.h rename to src/algorithm/genotype_bitpacker.h index 1fc83de..bb69111 100644 --- a/src/algorithm/GenotypeBitPacker.h +++ b/src/algorithm/genotype_bitpacker.h @@ -1,5 +1,5 @@ -#ifndef ALGORITHM_GENOTYPEBITPACKER_H_ -#define ALGORITHM_GENOTYPEBITPACKER_H_ +#ifndef ALGORITHM_GENOTYPE_BITPACKER_H_ +#define ALGORITHM_GENOTYPE_BITPACKER_H_ namespace Tomahawk{ namespace Algorithm{ @@ -86,4 +86,4 @@ class GenotypeBitPacker{ } } -#endif /* ALGORITHM_GENOTYPEBITPACKER_H_ */ +#endif /* ALGORITHM_GENOTYPE_BITPACKER_H_ */ diff --git a/src/algorithm/LoadBalancerBlock.h b/src/algorithm/load_balancer_block.h similarity index 81% rename from src/algorithm/LoadBalancerBlock.h rename to src/algorithm/load_balancer_block.h index a01daeb..d6f45f3 100644 --- a/src/algorithm/LoadBalancerBlock.h +++ b/src/algorithm/load_balancer_block.h @@ -1,8 +1,8 @@ -#ifndef ALGORITHM_LOADBALANCERBLOCK_H_ -#define ALGORITHM_LOADBALANCERBLOCK_H_ +#ifndef ALGORITHM_LOAD_BALANCER_BLOCK_H_ +#define ALGORITHM_LOAD_BALANCER_BLOCK_H_ #include -#include "../support/TypeDefinitions.h" +#include "../support/type_definitions.h" namespace Tomahawk{ @@ -21,15 +21,15 @@ struct LoadBalancerBlock{ {} LoadBalancerBlock(const U32 fromRow, const U32 toRow, const U32 fromColumn, const U32 toColumn, const U32 fromRowAbsolute, const U32 toRowAbsolute, const U32 fromColumnAbsolute, const U32 toColumnAbsolute, bool stagger = false) : - fromRow(fromRow), - toRow(toRow), - fromColumn(fromColumn), - 
toColumn(toColumn), - staggered(stagger), - fromRowAbsolute(fromRowAbsolute), - toRowAbsolute(toRowAbsolute), - fromColumnAbsolute(fromColumnAbsolute), - toColumnAbsolute(toColumnAbsolute) + fromRow(fromRow), + toRow(toRow), + fromColumn(fromColumn), + toColumn(toColumn), + staggered(stagger), + fromRowAbsolute(fromRowAbsolute), + toRowAbsolute(toRowAbsolute), + fromColumnAbsolute(fromColumnAbsolute), + toColumnAbsolute(toColumnAbsolute) {} ~LoadBalancerBlock(){} @@ -44,10 +44,10 @@ struct LoadBalancerBlock{ } inline LoadBalancerBlock& operator()(const U32 fromRow, const U32 toRow, const U32 fromColumn, const U32 toColumn, const bool diagonal){ - this->fromRow = fromRow; - this->toRow = toRow; + this->fromRow = fromRow; + this->toRow = toRow; this->fromColumn = fromColumn; - this->toColumn= toColumn; + this->toColumn = toColumn; return(*this); } @@ -80,4 +80,4 @@ struct LoadBalancerBlock{ } -#endif /* ALGORITHM_LOADBALANCERBLOCK_H_ */ +#endif /* ALGORITHM_LOAD_BALANCER_BLOCK_H_ */ diff --git a/src/algorithm/load_balancer_ld.cpp b/src/algorithm/load_balancer_ld.cpp new file mode 100644 index 0000000..d5a09af --- /dev/null +++ b/src/algorithm/load_balancer_ld.cpp @@ -0,0 +1,257 @@ +#include "load_balancer_ld.h" + +namespace Tomahawk{ + +LoadBalancerLD::LoadBalancerLD() : selected_chunk(0), desired_chunks(1){} +LoadBalancerLD::~LoadBalancerLD(){} + +bool LoadBalancerLD::getSelectedLoad(){ + //std::cerr << Helpers::timestamp("DEBUG", "BALANCER") << "What data to load?" 
<< std::endl; + value_type selected = this->blocks[this->selected_chunk]; + //std::cerr << Helpers::timestamp("DEBUG", "BALANCER") << this->selected_chunk << '/' << this->blocks.size() << std::endl; + + // attempt to merge + // If there are both equal + if(selected.fromRow == selected.fromColumn && selected.toRow == selected.toColumn){ + this->data_to_load.push_back(std::pair(selected.fromRow, selected.toRow)); + //std::cerr << "same: " << selected << std::endl; + } else { + // No voerlap + //std::cerr << "Not same: " << selected << std::endl; + this->data_to_load.push_back(std::pair(selected.fromRow, selected.toRow)); + this->data_to_load.push_back(std::pair(selected.fromColumn, selected.toColumn)); + } + + return true; +} + +bool LoadBalancerLD::getSelectedLoadThreads(const U32 threads){ + const value_type& selected = this->blocks[selected_chunk]; + //std::cerr << Helpers::timestamp("DEBUG", "BALANCER") << "Thread balancing..." << std::endl; + + this->thread_distribution.resize(threads); + + if(threads == 1){ + this->thread_distribution[0].push_back(value_type(0, selected.getRows(), 0, selected.getColumns(), selected.fromRow, selected.toRow, selected.fromColumn, selected.toColumn, selected.isDiagonal())); + return true; + } + + // + if(selected.isDiagonal()){ + if(!SILENT){ + std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Case is diagonal (chunk " << this->selected_chunk << '/' << this->desired_chunks << ")..." 
<< std::endl; + std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Total comparisons: " << Helpers::ToPrettyString(selected.getSize()) << " and per thread: " << Helpers::ToPrettyString(selected.getSize()/threads) << std::endl; + } + + U32 loadThread = selected.getSize()/threads; + U32 it = 0; + U32 from = 0; + U32 fromCol = 0; + U32 threadID = 0; + + // + for(U32 i = 0; i < selected.getRows(); ++i){ + for(U32 j = i; j < selected.getColumns(); ++j){ + ++it; + + // If number of comparions over threshold + if(it >= loadThread){ + // if broken over a line + // i.e. not broken on the same line number + if(from == i){ + //std::cerr << "B\t" << threadID << ": " << from << '-' << i+1 << '\t' << fromCol << '-' << j << '\t' << selected.fromRow+from << '-' << selected.fromRow+(i+1) << '\t' << selected.fromColumn+fromCol << '-' << selected.toColumn+j << std::endl; + this->thread_distribution[threadID].push_back(value_type(from, i+1, fromCol, j, selected.fromRow+from, selected.fromRow+i+1, selected.fromColumn+fromCol, selected.fromColumn+j)); + } + // If broken over multiple lines + else { + if(threadID + 1 == threads){ + i = selected.getRows() - 1; + j = selected.getColumns(); + } + + // If next line: no middle full lines + if(from + 1 == i){ + //std::cerr << "N\t" << threadID << ": " << from << '-' << from+1 << '\t' << fromCol << '-' << selected.getColumns() << '\t' << "FALSE" << std::endl; + //std::cerr << "N\t" << threadID << ": " << i << '-' << i+1 << '\t' << i << '-' << j << '\t' << "FALSE" << std::endl; + this->thread_distribution[threadID].push_back(value_type(from, from+1, fromCol, selected.getColumns(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.toColumn)); + this->thread_distribution[threadID].push_back(value_type(i, i+1, i, j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+i, selected.fromColumn+j)); + fromCol = j; + from = i; + } else { + //std::cerr << "E\t" << threadID << ": " << from << '-' << from + 
1 << '\t' << fromCol << '-' << selected.getColumns() << '\t' << selected.fromRow+from << '-' << selected.fromRow+(from+1) << '\t' << selected.fromColumn+fromCol << '-' << selected.toColumn << std::endl; + //std::cerr << "E\t" << threadID << ": " << from + 1 << '-' << i << '\t' << from + 1 << '-' << selected.getColumns() << '\t' << selected.fromRow+from+1 << '-' << selected.fromRow+(i) << '\t' << selected.fromColumn+from+1 << '-' << selected.toColumn << std::endl; + //std::cerr << "E\t" << threadID << ": " << i << '-' << i + 1 << '\t' << i << '-' << j << '\t' << selected.fromRow+i << '-' << selected.fromRow+(i+1) << '\t' << selected.fromColumn+i << '-' << selected.fromColumn+j << std::endl; + this->thread_distribution[threadID].push_back(value_type(from, from + 1, fromCol, selected.getColumns(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.toColumn)); + this->thread_distribution[threadID].push_back(value_type(from + 1, i, from + 1, selected.getColumns(), selected.fromRow+from+1, selected.fromRow+i, selected.fromColumn+from+1, selected.toColumn, true)); + this->thread_distribution[threadID].push_back(value_type(i, i + 1, i, j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+i, selected.fromColumn+j)); + } + } + it = 0; + from = i; + fromCol = j; + ++threadID; + } + + } + } + } + // Is not a diagonal square + else { + if(!SILENT){ + std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Case is square (chunk " << this->selected_chunk << '/' << this->desired_chunks << ")..." 
<< std::endl; + std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Total comparisons: " << Helpers::ToPrettyString(selected.getSize()) << " and per thread: " << Helpers::ToPrettyString(selected.getSize()/threads) << std::endl; + } + + U32 loadThread = selected.getSize()/threads; + U32 it = 0; + U32 from = 0; + U32 fromCol = selected.getRows(); + U32 threadID = 0; + + // + for(U32 i = 0; i < selected.getRows(); ++i){ + for(U32 j = selected.getRows(); j < 2*selected.getRows(); ++j){ + ++it; + + // If number of comparions over threshold + if(it >= loadThread){ + // if broken over a line + // i.e. not broken on the same line number + if(from == i){ + //std::cerr << threadID << ": " << from << '-' << i+1 << '\t' << fromCol << '-' << j << '\t' << "FALSE" << std::endl; + this->thread_distribution[threadID].push_back(value_type(from, i+1, fromCol, j, selected.fromRow+from, selected.fromRow+i+1, selected.fromColumn+fromCol, selected.fromColumn+j)); + } + // If broken over multiple lines + else { + if(threadID + 1 == threads){ + i = selected.getRows() - 1; + j = 2*selected.getRows(); + } + + // If next line: no middle full lines + if(from + 1 == i){ + //std::cerr << threadID << ": " << from << '-' << from+1 << '\t' << fromCol << '-' << 2*selected.getRows() << '\t' << "FALSE" << std::endl; + //std::cerr << threadID << ": " << i << '-' << i+1 << '\t' << selected.getRows() << '-' << j << '\t' << "FALSE" << std::endl; + this->thread_distribution[threadID].push_back(value_type(from, from+1, fromCol, 2*selected.getRows(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.fromColumn+2*selected.getRows())); + this->thread_distribution[threadID].push_back(value_type(i, i+1, 2*selected.getRows(), j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+2*selected.getRows(), selected.fromColumn+j)); + fromCol = j; + from = i; + } else { + //std::cerr << threadID << ": " << from << '-' << from + 1 << '\t' << fromCol << '-' << 
2*selected.getRows() << '\t' << "FALSE" << std::endl; + //std::cerr << threadID << ": " << from + 1 << '-' << i << '\t' << selected.getRows() << '-' << 2*selected.getRows() << '\t' << "FALSE" << std::endl; + //std::cerr << threadID << ": " << i << '-' << i + 1 << '\t' << selected.getRows() << '-' << j << '\t' << "FALSE" << std::endl; + this->thread_distribution[threadID].push_back(value_type(from, from + 1, fromCol, 2*selected.getRows(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.fromColumn+2*selected.getRows())); + this->thread_distribution[threadID].push_back(value_type(from + 1, i, selected.getRows(), 2*selected.getRows(), selected.fromRow+from+1, selected.fromRow+i, selected.fromColumn+selected.getRows(), selected.fromColumn+2*selected.getRows())); + this->thread_distribution[threadID].push_back(value_type(i, i + 1, selected.getRows(), j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+selected.getRows(), selected.fromColumn+j)); + } + } + it = 0; + from = i; + fromCol = j; + ++threadID; + } + + } + } + } + + // assertion + + + //std::cerr << "DEBUG" << std::endl; + //for(U32 i = 0; i < this->thread_distribution.size(); ++i) + // std::cerr << i << '\t' << this->thread_distribution[i].size() << std::endl; + //std::cerr << "Has: " << this->thread_distribution.size() << " thread blocks" << std::endl; + + return true; +} + +bool LoadBalancerLD::setSelected(const S32 selected){ + if(selected < 0){ + std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Cannot set select a negative chunk..." << std::endl; + return false; + } + + this->selected_chunk = selected; + return true; +} + +bool LoadBalancerLD::setDesired(const S32 desired){ + if(desired < 0){ + std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Cannot cut workload into a negative number of blocks..." 
<< std::endl; + return false; + } + + this->desired_chunks = desired; + return true; +} + +bool LoadBalancerLD::Build(const U32 total_blocks, const U32 threads){ + if(this->selected_chunk > this->desired_chunks){ + std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Incorrectly selected block (" << this->selected_chunk << '/' << this->desired_chunks << ")..." << std::endl; + return false; + } + + // If selecting > 1 chunk + if(this->desired_chunks != 1){ + U32 cutSize = 1; + //std::vector backup_cuts; + for(U32 i = 1; i < total_blocks; ++i){ + + if((i*i - i) / 2 == this->desired_chunks) + cutSize = i; + + } + + if(cutSize == 1){ + std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Cannot cut into " << this->desired_chunks << " chunks" << std::endl; + return(false); + } + + U32 total = 0; // Sanity + //std::cerr << "cut-size is: " << cutSize << std::endl; + const U32 rowLength = total_blocks / cutSize; + for(U32 i = 0; i < cutSize-1; ++i){ + //std::cerr << i << '/' << cutSize-1 << std::endl; + U32 j = i; + U32 fromX = i*rowLength; + U32 toX = (i+1)*rowLength; + if(i + 1 == cutSize - 1) + toX = total_blocks; + + for(; j < cutSize-1; ++j){ + U32 fromY = j*rowLength; + U32 toY = (j+1)*rowLength; + + + if(j + 1 == cutSize - 1) + toY = total_blocks; + + //std::cerr << "(" << i << ',' << j << ")\t" << fromX << '-' << toX << '\t' << fromY << '-' << toY << std::endl; + this->blocks.push_back(value_type(fromX, toX, fromY, toY)); + ++total; + } + } + + //std::cerr << "Total: " << total << '/' << this->desired_chunks << std::endl; + if(total != this->desired_chunks){ + std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Corrupted balancing..." << std::endl; + return(false); + } + + } else { + // All blocks + this->blocks.push_back(value_type(0, total_blocks, 0, total_blocks)); + } + + // What data do we load? 
+ this->getSelectedLoad(); + + // Divide data into threads + if(!this->getSelectedLoadThreads(threads)) + return false; + + return true; +} + +} diff --git a/src/algorithm/load_balancer_ld.h b/src/algorithm/load_balancer_ld.h new file mode 100644 index 0000000..b316caf --- /dev/null +++ b/src/algorithm/load_balancer_ld.h @@ -0,0 +1,42 @@ +#ifndef ALGORITHM_LOAD_BALANCER_LD_H_ +#define ALGORITHM_LOAD_BALANCER_LD_H_ + +#include "../support/MagicConstants.h" +#include "../support/helpers.h" +#include "load_balancer_block.h" + +namespace Tomahawk{ + +class LoadBalancerLD{ +private: + typedef LoadBalancerLD self_type; + typedef LoadBalancerBlock value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + +public: + LoadBalancerLD(); + ~LoadBalancerLD(); + + bool getSelectedLoad(); + bool getSelectedLoadThreads(const U32 threads); + bool setSelected(const S32 selected); + bool setDesired(const S32 desired); + bool Build(const U32 total_blocks, const U32 threads); + inline std::vector< std::pair >& getLoad(void){ return(this->data_to_load); } + +public: + U32 selected_chunk; + U32 desired_chunks; + + std::vector blocks; + std::vector< std::pair > data_to_load; + std::vector< std::vector > thread_distribution; +}; + +} +#endif /* ALGORITHM_LOAD_BALANCER_LD_H_ */ diff --git a/src/algorithm/OpenHashTable.h b/src/algorithm/open_hashtable.h similarity index 97% rename from src/algorithm/OpenHashTable.h rename to src/algorithm/open_hashtable.h index 4cbd86a..ec337cd 100755 --- a/src/algorithm/OpenHashTable.h +++ b/src/algorithm/open_hashtable.h @@ -7,7 +7,7 @@ #include #include -#include "../support/TypeDefinitions.h" +#include "../support/type_definitions.h" #include "../third_party/xxhash/xxhash.h" namespace Tomahawk { @@ -38,8 +38,8 @@ class HashTable{ bool GetItem(const T* key, K*& entry, U32 
length = sizeof(T)); bool GetItem(const void* key_address, const T* key, K*& entry, U32 length = sizeof(T)); void clear(); - U32 size(void) const{return this->__size;} - U32 occupied(void) const{return this->__occupied;} + inline const U32& size(void) const{return this->__size;} + inline const U32& occupied(void) const{return this->__occupied;} Entry& operator[](const U32 position){return *this->__entries[position];} K& at(const U32 position){return this->__entries.at(position);} diff --git a/src/algorithm/sort/TomahawkOutputSort.cpp b/src/algorithm/sort/TomahawkOutputSort.cpp deleted file mode 100644 index 377950e..0000000 --- a/src/algorithm/sort/TomahawkOutputSort.cpp +++ /dev/null @@ -1,385 +0,0 @@ -#include - -#include "TomahawkOutputSort.h" - -namespace Tomahawk{ -namespace Algorithm{ -namespace Output{ - -bool TomahawkOutputSorter::sort(const std::string& input, const std::string& destinationPrefix, U64 memory_limit){ - if(!this->reader.Open(input)){ - std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << input << "..." << std::endl; - return false; - } - - // - std::vector paths = Helpers::filePathBaseExtension(destinationPrefix); - this->basePath = paths[0]; - if(this->basePath.size() > 0) - this->basePath += '/'; - - if(paths[3].size() == Tomahawk::Constants::OUTPUT_LD_SUFFIX.size() && - strncasecmp(&paths[3][0], &Tomahawk::Constants::OUTPUT_LD_SUFFIX[0], Tomahawk::Constants::OUTPUT_LD_SUFFIX.size()) == 0) - this-> baseName = paths[2]; - else this->baseName = paths[1]; - - // Writing - this->reader.setWriterType(0); - this->reader.addLiteral("\n##tomahawk_partialSortCommand=" + Helpers::program_string()); - this->reader.OpenWriter(basePath + baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX); - - basic_writer_type toi_writer; - toi_writer.open(basePath + baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX + '.' 
+ Tomahawk::Constants::OUTPUT_LD_SORT_INDEX_SUFFIX); - toi_header_type headIndex(Tomahawk::Constants::WRITE_HEADER_LD_SORT_MAGIC, this->reader.header.samples, this->reader.header.n_contig); - headIndex.controller.sorted = 0; - headIndex.controller.expanded = this->reverse_entries ? 1 : 0; - headIndex.controller.partial_sort = 1; - toi_writer.getNativeStream() << headIndex; - // writer - basic_writer_type& stream = *reinterpret_cast(this->reader.writer->getStream()); - - if(memory_limit < 10e6){ - memory_limit = 10e6; - std::cerr << Helpers::timestamp("SORT") << "Setting memory limit to 10 MB..." << std::endl; - } - - if(this->reverse_entries){ - std::cerr << Helpers::timestamp("SORT") << "Reversing entries..." << std::endl; - memory_limit /= 2; - } else - std::cerr << Helpers::timestamp("SORT") << "Not reversing..." << std::endl; - - // Perform indexed sorting if possible - if(this->reader.hasIndex){ - return(this->__sortIndexed(toi_writer, input, memory_limit)); - } - - std::cerr << Helpers::timestamp("SORT") << "No index found..." << std::endl; - - bool trigger_break = false; - U32 totempole_blocks_written = 0; - while(true){ - if(!SILENT) - std::cerr << Helpers::timestamp("LOG","SORT") << "Reading..." 
<< std::endl; - - if(!this->reader.nextBlockUntil(memory_limit)) - trigger_break = true; - - if(this->reader.output_buffer.size() == 0){ - trigger_break = true; - break; - } - - assert((this->reader.output_buffer.size() % sizeof(entry_type)) == 0); - - totempole_entry totempole; - if(this->reverse_entries) - totempole.entries = 2*(this->reader.output_buffer.size() / sizeof(entry_type)); - else - totempole.entries = this->reader.output_buffer.size() / sizeof(entry_type); - - if(this->reverse_entries){ - const entry_type* entry = nullptr; - while(this->reader.nextVariantLimited(entry)){ - entry_type temp(entry); - temp.swapDirection(); - this->reader.output_buffer.Add((char*)&temp, sizeof(entry_type)); - } - } - - if(!SILENT) - std::cerr << Helpers::timestamp("LOG","SORT") << "Sorting: " << Helpers::ToPrettyString(this->reader.output_buffer.size()/sizeof(entry_sort_type)) << " entries" << std::endl; - - std::sort(reinterpret_cast(&this->reader.output_buffer.data[0]), - reinterpret_cast(&this->reader.output_buffer.data[this->reader.output_buffer.size()])); - - if(!SILENT) - std::cerr << Helpers::timestamp("LOG","SORT") << "Indexing..." << std::endl; - - totempole.byte_offset = stream.getNativeStream().tellp(); - totempole.uncompressed_size = this->reader.output_buffer.size(); - - this->reader.writer->write(this->reader.output_buffer); - totempole.byte_offset_end = stream.getNativeStream().tellp(); - toi_writer.getNativeStream() << totempole; - - if(!SILENT) - std::cerr << Helpers::timestamp("LOG","SORT") << "Writing..." << std::endl; - - ++totempole_blocks_written; - - if(trigger_break) break; - } - - // Make sure TOI is flushed before re-opening and seeking - toi_writer.flush(); - - // TOI - // Update blocks written - std::fstream re(basePath + baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX + '.' 
+ Tomahawk::Constants::OUTPUT_LD_SORT_INDEX_SUFFIX, std::ios::in | std::ios::out | std::ios::binary); - if(!re.good()){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to reopen index..." << std::endl; - return false; - } - - re.seekg(Tomahawk::Constants::WRITE_HEADER_LD_SORT_MAGIC_LENGTH + sizeof(float) + sizeof(U64) + sizeof(U32)); - if(!re.good()){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to seek in index..." << std::endl; - return false; - } - - re.write((char*)&totempole_blocks_written, sizeof(U32)); - if(!re.good()){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to update counts in index..." << std::endl; - return false; - } - re.close(); - - toi_writer.close(); - - this->reader.writer->flush(); - this->reader.writer->close(); - - return true; -} - -bool TomahawkOutputSorter::__sortIndexed(basic_writer_type& toi_writer, const std::string& input, U64 memory_limit){ - std::cerr << Helpers::timestamp("SORT") << "Index found..." << std::endl; - - std::vector< totempole_entry > blocks; - - totempole_entry totempole; - totempole.byte_offset = this->reader.toi_reader[0].byte_offset; - - U64 n_entries = 0; - for(U32 i = totempole.uncompressed_size; i < this->reader.toi_reader.size(); ++i){ - if(totempole.uncompressed_size > memory_limit){ - totempole.byte_offset_end = this->reader.toi_reader[i].byte_offset; - blocks.push_back(totempole); - totempole.byte_offset = this->reader.toi_reader[i].byte_offset; - totempole.entries = 0; - totempole.uncompressed_size = 0; - } - totempole.entries += this->reader.toi_reader[i].entries; - totempole.uncompressed_size += this->reader.toi_reader[i].uncompressed_size; - n_entries += totempole.entries; - } - - // Have to add final - if(totempole.byte_offset != blocks.back().byte_offset){ - totempole.byte_offset_end = this->reader.toi_reader[this->reader.toi_reader.size() - 1].byte_offset_end; - blocks.push_back(totempole); - } - - if(totempole.entries != 0) - blocks.push_back(totempole); 
- - // Todo: if n_threads > blocks.size() - // set n_threads to block.size() and give each thread 1 block - - // Split workload into different threads - // Each thread get approximately 1/threads amount of work - std::vector< totempole_entry > thread_workload(this->n_threads); - const U64 limit_thread = (U64)((double)blocks.size()/this->n_threads); - U32 current_thread_target = 0; - U64 n_blocks_loaded = 0; - totempole.reset(); - totempole.byte_offset = blocks[0].byte_offset; - - for(U32 i = 0; i < blocks.size(); ++i){ - if(n_blocks_loaded >= limit_thread && current_thread_target + 1 != this->n_threads){ - n_blocks_loaded = 0; - totempole.byte_offset_end = blocks[i].byte_offset; - thread_workload[current_thread_target] = totempole; - totempole.byte_offset = blocks[i].byte_offset; - ++current_thread_target; - } - ++n_blocks_loaded; - } - - // Add final - if(current_thread_target != this->n_threads){ - totempole.byte_offset_end = blocks.back().byte_offset_end; - thread_workload[this->n_threads-1] = totempole; - } - - std::cerr << Helpers::timestamp("SORT") << "Spawning " << this->n_threads << " threads..." << std::endl; - std::thread** slaves = new std::thread*[this->n_threads]; - slave_sorter** instances = new slave_sorter*[this->n_threads]; - for(U32 i = 0; i < this->n_threads; ++i){ - instances[i] = new slave_sorter(this->reader.writer, toi_writer, memory_limit); - if(!instances[i]->open(input)){ - std::cerr << Helpers::timestamp("ERROR", "SORT") << "Failed to reopen file..." 
<< std::endl; - exit(1); - } - - // Trigger reverse if applicable - instances[i]->reverseEntries(this->reverse_entries); - } - - for(U32 i = 0; i < this->n_threads; ++i) - slaves[i] = instances[i]->start(thread_workload[i]); - - for(U32 i = 0; i < this->n_threads; ++i) - slaves[i]->join(); - - U32 totempole_blocks_written = 0; - for(U32 i = 0; i < this->n_threads; ++i) - totempole_blocks_written += instances[i]->getBlocksWritten(); - - // TOI - // Update blocks written - // Make sure TOI is flushed before re-opening and seeking - toi_writer.flush(); - - std::fstream re(basePath + baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX + '.' + Tomahawk::Constants::OUTPUT_LD_SORT_INDEX_SUFFIX, std::ios::in | std::ios::out | std::ios::binary); - if(!re.good()){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to reopen index..." << std::endl; - return false; - } - - re.seekg(Tomahawk::Constants::WRITE_HEADER_LD_SORT_MAGIC_LENGTH + sizeof(float) + sizeof(U64) + sizeof(U32)); - if(!re.good()){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to seek in index..." << std::endl; - return false; - } - - re.write((char*)&totempole_blocks_written, sizeof(U32)); - if(!re.good()){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to update counts in index..." << std::endl; - return false; - } - re.close(); - - toi_writer.close(); - - this->reader.writer->flush(); - this->reader.writer->close(); - - // Cleanup - for(U32 i = 0; i < this->n_threads; ++i){ - delete instances[i]; - slaves[i] = nullptr; - } - delete [] instances; - delete [] slaves; - - return true; -} - -bool TomahawkOutputSorter::sortMerge(const std::string& inputFile, const std::string& destinationPrefix, const U32 block_size){ - if(!this->reader.Open(inputFile)){ - std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << inputFile << "..." 
<< std::endl; - return false; - } - - if(!this->reader.hasIndex){ - std::cerr << Helpers::timestamp("ERROR","SORT") << "File is not indexed!" << std::endl; - return false; - } - - this->reader.addLiteral("\n##tomahawk_mergeSortCommand=" + Helpers::program_string()); - - toi_header_type toi_header = this->reader.toi_reader.getHeader(); - if(toi_header.controller.sorted == true){ - std::cerr << Helpers::timestamp("ERROR","SORT") << "File is already sorted!" << std::endl; - return false; - } - - if(toi_header.controller.partial_sort == false){ - std::cerr << Helpers::timestamp("ERROR","SORT") << "File is not partially sorted!" << std::endl; - return false; - } - - toi_header.controller.partial_sort = false; - toi_header.controller.sorted = true; - writer_type writer(this->reader.contigs, &this->reader.header, toi_header); - - if(!writer.open(destinationPrefix)){ - std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open!" << std::endl; - return false; - } - writer.writeHeader(this->reader.literals); - - const U32 n_toi_entries = this->reader.toi_reader.size(); - std::ifstream* streams = new std::ifstream[n_toi_entries]; - tgzf_iterator** iterators = new tgzf_iterator*[n_toi_entries]; - - if(!SILENT) - std::cerr << Helpers::timestamp("LOG", "SORT") << "Opening " << n_toi_entries << " file handles..."; - - for(U32 i = 0; i < n_toi_entries; ++i){ - streams[i].open(inputFile); - streams[i].seekg(this->reader.toi_reader[i].byte_offset); - iterators[i] = new tgzf_iterator(streams[i], 65536, this->reader.toi_reader[i].byte_offset, this->reader.toi_reader[i].byte_offset_end); - } - - if(!SILENT) - std::cerr << " Done!" << std::endl; - - // queue - queue_type outQueue; - - // - if(!SILENT) - std::cerr << Helpers::timestamp("LOG", "SORT") << "Merging..." 
<< std::endl; - - // draw one from each - const entry_type* e = nullptr; - for(U32 i = 0; i < n_toi_entries; ++i){ - if(!iterators[i]->nextEntry(e)){ - std::cerr << Helpers::timestamp("ERROR", "SORT") << "Failed to get an entry..." << std::endl; - return false; - } - outQueue.push( queue_entry(e, i, IO::Support::TomahawkOutputEntryCompFuncConst) ); - } - - if(outQueue.empty()){ - std::cerr << Helpers::timestamp("ERROR","SORT") << "No data in queue..." << std::endl; - return false; - } - - // while queue is not empty - while(outQueue.empty() == false){ - // peek at top entry in queue - const U32 id = outQueue.top().streamID; - writer << outQueue.top().data; - - // remove this record from the queue - outQueue.pop(); - - - while(iterators[id]->nextEntry(e)){ - if(!(*e < outQueue.top().data)){ - outQueue.push( queue_entry(e, id, IO::Support::TomahawkOutputEntryCompFuncConst) ); - break; - } - writer << *e; - } - } - - writer.flush(); - if(!writer.finalize(toi_header.controller.expanded)){ - std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to finalize index..." 
<< std::endl; - return false; - } - - writer.close(); - - // Temp - //index_type& index = writer.getIndex(); - //std::cerr << index << std::endl; - - // Cleanup - for(U32 i = 0; i < n_toi_entries; ++i) - delete iterators[i]; - - delete [] iterators; - delete [] streams; - - return true; -} - -} -} -} diff --git a/src/algorithm/sort/TomahawkOutputSort.h b/src/algorithm/sort/TomahawkOutputSort.h deleted file mode 100644 index 52758ad..0000000 --- a/src/algorithm/sort/TomahawkOutputSort.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef TOMAHAWKOUTPUTSORT_H_ -#define TOMAHAWKOUTPUTSORT_H_ - -#include -#include - -#include "../../io/compression/TGZFEntryIterator.h" -#include "../../totempole/TotempoleOutputEntry.h" -#include "../../tomahawk/TomahawkOutput/TomahawkOutputReader.h" -#include "../../tomahawk/TomahawkOutput/TomahawkOutputManager.h" -#include "TomahawkOutputSortMergeQueueContainer.h" -#include "TomahawkOutputSortSlave.h" - -namespace Tomahawk{ -namespace Algorithm{ -namespace Output{ - -// Sorter -class TomahawkOutputSorter{ - typedef IO::TomahawkOutputEntry entry_type; - typedef IO::TomahawkOutputEntrySort entry_sort_type; - typedef TomahawkOutputSorter self_type; - typedef TomahawkOutputSortMergeQueueContainer queue_entry; - typedef std::priority_queue< queue_entry > queue_type; // prio queue - typedef IO::TomahawkOutputReader two_reader_type; - typedef Totempole::TotempoleOutputEntry totempole_entry; - typedef IO::TomahawkOutputWriterIndex writer_type; - typedef IO::WriterFile basic_writer_type; - typedef TomahawkOutputSortSlave slave_sorter; - typedef IO::TGZFEntryIterator tgzf_iterator; - typedef Totempole::TotempoleOutputSortedIndex index_type; - typedef Tomahawk::IO::TomahawkOutputSortHeader toi_header_type; - -public: - TomahawkOutputSorter() : n_threads(std::thread::hardware_concurrency()), reverse_entries(true){} - ~TomahawkOutputSorter(){} - - bool sort(const std::string& input, const std::string& destinationPrefix, U64 memory_limit); - bool 
sortMerge(const std::string& input, const std::string& destinationPrefix, const U32 block_size); - -private: - bool __sortUnindexed(); - bool __sortIndexed(basic_writer_type& toi_writer, const std::string& input, U64 memory_limit); - -private: - two_reader_type reader; - -public: - U32 n_threads; - bool reverse_entries; - std::string baseName; - std::string basePath; -}; - - -} -} -} - -#endif /* TOMAHAWKOUTPUTSORT_H_ */ diff --git a/src/algorithm/sort/TomahawkOutputSortSlave.h b/src/algorithm/sort/TomahawkOutputSortSlave.h deleted file mode 100644 index 9d46b6e..0000000 --- a/src/algorithm/sort/TomahawkOutputSortSlave.h +++ /dev/null @@ -1,128 +0,0 @@ -#ifndef TOMAHAWKOUTPUTSORTSLAVE_H_ -#define TOMAHAWKOUTPUTSORTSLAVE_H_ - -#include - -namespace Tomahawk{ -namespace Algorithm{ -namespace Output{ - -class TomahawkOutputSortSlave{ - typedef TomahawkOutputSortSlave self_type; - typedef IO::TomahawkOutputEntry entry_type; - typedef IO::TomahawkOutputEntrySort entry_sort_type; - typedef Totempole::TotempoleOutputEntry totempole_entry; - typedef IO::WriterFile writer_type; - typedef IO::TomahawkOutputReader two_reader_type; - typedef Tomahawk::IO::TomahawkOutputWriterInterface two_writer_interface; - typedef Tomahawk::IO::TomahawkOutputWriter two_writer_type; - typedef IO::TGZFController tgzf_controller_type; - -public: - TomahawkOutputSortSlave(two_writer_interface* writer, writer_type& toi_writer, const U32 memory_limit) : - memory_limit(memory_limit), - blocks_written(0), - writer(reinterpret_cast(writer)), - toi_writer(toi_writer), - reverse_entries(true) - {} - ~TomahawkOutputSortSlave(){} - - inline void reverseEntries(const bool yes = true){ this->reverse_entries = yes; } - - bool open(const std::string& input){ - if(!this->reader.Open(input)){ - std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << input << "..." 
<< std::endl; - return false; - } - - return true; - } - - std::thread* start(const totempole_entry& workload){ - this->thread = std::thread(&self_type::sort, this, workload); - return(&this->thread); - } - - inline const U32& getBlocksWritten(void) const{ return(this->blocks_written); } - -private: - bool sort(const totempole_entry& workload){ - bool trigger_break = false; - writer_type& stream = *reinterpret_cast(this->writer->getStream()); - this->reader.stream.seekg(workload.byte_offset); - if(!this->reader.stream.good()){ - std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to seek in file..." << std::endl; - return false; - } - - totempole_entry totempole; - while(true){ - if(this->reader.stream.tellg() == workload.byte_offset_end) - break; - - if(!this->reader.nextBlockUntil(this->memory_limit, workload.byte_offset_end)) - trigger_break = true; - - - if(this->reader.output_buffer.size() == 0){ - trigger_break = true; - break; - } - - assert((this->reader.output_buffer.size() % sizeof(entry_type)) == 0); - - if(this->reverse_entries){ - const entry_type* entry = nullptr; - totempole.entries += 2*((this->reader.output_buffer.size() % sizeof(entry_type))); - while(this->reader.nextVariantLimited(entry)){ - // Flip cA,pA with cB,pB - entry_type temp(entry); - temp.swapDirection(); - this->reader.output_buffer.Add((char*)&temp, sizeof(entry_type)); - } - } else { - // Do not reverse - totempole.entries = (this->reader.output_buffer.size() % sizeof(entry_type)); - } - - std::sort(reinterpret_cast(&this->reader.output_buffer.data[0]), - reinterpret_cast(&this->reader.output_buffer.data[this->reader.output_buffer.size()])); - - totempole.reset(); - totempole.uncompressed_size = this->reader.output_buffer.size(); - - this->controller.Clear(); - this->controller.Deflate(this->reader.output_buffer); - - this->writer->getLock()->lock(); - totempole.byte_offset = stream.getNativeStream().tellp(); - 
this->writer->getStream()->writeNoLock(this->controller.buffer.data, this->controller.buffer.pointer); - ++this->blocks_written; - totempole.byte_offset_end = stream.getNativeStream().tellp(); - toi_writer.getNativeStream() << totempole; - this->writer->getLock()->unlock(); - - if(trigger_break) break; - } - - return true; - } - -private: - const U32 memory_limit; - U32 blocks_written; - two_writer_type* writer; - writer_type& toi_writer; - two_reader_type reader; - std::thread thread; - tgzf_controller_type controller; - bool reverse_entries; -}; - - -} -} -} - -#endif /* TOMAHAWKOUTPUTSORTSLAVE_H_ */ diff --git a/src/algorithm/sort/TomahawkOutputSortMergeQueueContainer.h b/src/algorithm/sort/output_sort_merge_queue.h similarity index 63% rename from src/algorithm/sort/TomahawkOutputSortMergeQueueContainer.h rename to src/algorithm/sort/output_sort_merge_queue.h index bfc2476..6c5cfed 100644 --- a/src/algorithm/sort/TomahawkOutputSortMergeQueueContainer.h +++ b/src/algorithm/sort/output_sort_merge_queue.h @@ -3,17 +3,16 @@ namespace Tomahawk{ namespace Algorithm{ -namespace Output{ - template -struct TomahawkOutputSortMergeQueueContainer { +struct OutputSortMergeQueue { +private: typedef T entry_type; - typedef TomahawkOutputSortMergeQueueContainer self_type; + typedef OutputSortMergeQueue self_type; public: - TomahawkOutputSortMergeQueueContainer(const entry_type* data, - U32 streamID, - bool (*compFunc)(const entry_type& a, const entry_type& b) = T::operator<) + OutputSortMergeQueue(const entry_type* data, + U32 streamID, + bool (*compFunc)(const entry_type& a, const entry_type& b) = T::operator<) : streamID(streamID) , data(*data) , compFunc(compFunc) @@ -29,7 +28,6 @@ struct TomahawkOutputSortMergeQueueContainer { bool (*compFunc)(const entry_type& a, const entry_type& b); }; -} } } diff --git a/src/algorithm/sort/output_sort_slave.h b/src/algorithm/sort/output_sort_slave.h new file mode 100644 index 0000000..be775b3 --- /dev/null +++ 
b/src/algorithm/sort/output_sort_slave.h @@ -0,0 +1,120 @@ +#ifndef TOMAHAWKOUTPUTSORTSLAVE_H_ +#define TOMAHAWKOUTPUTSORTSLAVE_H_ + +#include + +#include "../../io/output_writer.h" + +namespace Tomahawk{ +namespace Algorithm{ + +/**< + * Worker slave for partially sorting a (large) `two` file + */ +class OutputSortSlave { +private: + typedef OutputSortSlave self_type; + typedef IO::OutputEntry entry_type; + typedef IO::OutputWriter writer_type; + typedef TomahawkOutputReader reader_type; + typedef IO::TGZFController tgzf_controller_type; + typedef IO::BasicBuffer buffer_type; + typedef IO::TGZFEntryIterator tgzf_iterator; + +public: + OutputSortSlave(reader_type& reader, writer_type& writer, const std::pair& workload, const U32 memory_limit) : + workload_(workload), + n_memory_limit_(memory_limit), + reader_(reader), + writer_(writer) + {} + + ~OutputSortSlave(){ + this->inflate_buffer_.deleteAll(); + this->data_.deleteAll(); + } + + bool open(const std::string& input){ + this->stream_.open(input, std::ios::binary | std::ios::in); + if(this->stream_.good() == false){ + std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << input << "..." << std::endl; + return false; + } + + return true; + } + + std::thread* start(void){ + this->thread_ = std::thread(&self_type::sort, this); + return(&this->thread_); + } + + inline const writer_type& getWriter(void) const{ return(this->writer_); } + +private: + bool sort(void){ + if(!this->stream_.seekg(this->workload_.first)){ + std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to seek to block!" << std::endl; + exit(1); // exit instead of return because of detached threads + } + + // iterator + const U64 n_entries_limit = this->n_memory_limit_ / sizeof(entry_type); + tgzf_iterator it(this->stream_, 65536, this->workload_.first, this->workload_.second); + bool finished_ = false; + + if(!this->stream_.good()){ + std::cerr << Helpers::timestamp("ERROR","SORT") << "Stream is bad!" 
<< std::endl; + exit(1); // exit instead of return because of detached threads + } + + while(true){ + OutputContainer container(this->n_memory_limit_ / sizeof(entry_type) + 1024); + + const entry_type* e = nullptr; + for(U32 i = 0; i < n_entries_limit; ++i){ + if(!it.nextEntry(e)){ + finished_ = true; + break; + } + container += *e; + } + + std::sort(&container.front(), &container.back()); + + const entry_type* prev = &container[0]; + for(size_t j = 1; j < container.size(); ++j){ + if(*prev >= container[j]){ + std::cerr << j-1 << ',' << j << std::endl; + std::cerr << *prev << std::endl; + std::cerr << container[j] << std::endl; + exit(1); + } + prev = &container[j]; + } + + this->writer_ << container; + if(finished_) + break; + } + + return true; + } + +private: + std::ifstream stream_; + std::pair workload_; + const U32 n_memory_limit_; + const reader_type& reader_; + writer_type writer_; + std::thread thread_; + buffer_type inflate_buffer_; + buffer_type data_; + tgzf_controller_type compression_manager_; +}; + + +} +} + +#endif /* TOMAHAWKOUTPUTSORTSLAVE_H_ */ diff --git a/src/algorithm/sort/output_sorter.cpp b/src/algorithm/sort/output_sorter.cpp new file mode 100644 index 0000000..c9f28de --- /dev/null +++ b/src/algorithm/sort/output_sorter.cpp @@ -0,0 +1,206 @@ +#include + +#include "output_sorter.h" + +namespace Tomahawk{ +namespace Algorithm{ + +// Algorithmic sketch: +// 1: Load data into balanced chunks of memory_limit bytes +// 2: Load partitioned data into containers +// 3: Perform sort +// 4: Perform merge if desired +bool OutputSorter::sort(const std::string& input, const std::string& destinationPrefix, U64 memory_limit){ + if(!this->reader.open(input)){ + std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << input << "..." << std::endl; + return false; + } + + if(this->reader.getIndex().getController().isSorted){ + std::cerr << Helpers::timestamp("LOG","SORT") << "File is already sorted..." 
<< std::endl; + return true; + } + + //std::cerr << this->reader.getIndex().totalBytes() << "->" << this->reader.getIndex().totalBytes() / this->n_threads << std::endl; + const U64 n_variants_chunk = this->reader.getIndex().totalBytes() / this->n_threads; + + std::pair* thread_distribution = new std::pair[this->n_threads]; + size_t i = 0; + U32 t = 0; + for(; t < this->n_threads; ++t){ + U64 partition_size = 0; + const size_t from = i; + for(; i < this->reader.getIndex().size(); ++i){ + partition_size += this->reader.getIndex().getContainer().at(i).sizeBytes(); + if(partition_size >= n_variants_chunk && t + 1 != this->n_threads){ + ++i; + break; + } + } + thread_distribution[t].first = this->reader.getIndex().getContainer().at(from).byte_offset; + thread_distribution[t].second = this->reader.getIndex().getContainer().at(i).byte_offset; + + if(i == this->reader.getIndex().size()){ + //std::cerr << "ran out of data" << std::endl; + thread_distribution[t].second = this->reader.getIndex().getContainer().back().byte_offset_end; + //std::cerr << "t: " << from << "->" << this->reader.getIndex().getContainer().size() << " (" << thread_distribution[t].first << "->" << thread_distribution[t].second << ") for " << partition_size << "/" << n_variants_chunk << std::endl; + ++t; + break; + } + //std::cerr << "t: " << from << "->" << i << " (" << thread_distribution[t].first << "->" << thread_distribution[t].second << ") for " << partition_size << "/" << n_variants_chunk << std::endl; + } + const U32 active_threads = t; + + // Append executed command to literals + this->reader.getHeader().getLiterals() += "\n##tomahawk_sortCommand=" + Helpers::program_string(); + + IO::OutputWriter writer; + if(!writer.open(destinationPrefix)){ + std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << destinationPrefix << "..." 
<< std::endl; + return false; + } + writer.writeHeaders(this->reader.getHeader()); + + if(!SILENT) + std::cerr << Helpers::timestamp("LOG","SORT") << "Spawning: " << active_threads << " workers..." << std::endl; + + OutputSortSlave** slaves = new OutputSortSlave*[active_threads]; + std::thread** threads = new std::thread*[active_threads]; + + for(U32 i = 0; i < active_threads; ++i){ + slaves[i] = new OutputSortSlave(this->reader, writer, thread_distribution[i], memory_limit); + if(!slaves[i]->open(input)){ + std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << input << "..." << std::endl; + return false; + } + threads[i] = slaves[i]->start(); + } + + for(U32 i = 0; i < active_threads; ++i) + threads[i]->join(); + + for(U32 i = 0; i < active_threads; ++i) + writer += slaves[i]->getWriter(); + + writer.setSorted(false); + writer.setPartialSorted(true); + writer.flush(); + writer.writeFinal(); + + if(!SILENT) + std::cerr << Helpers::timestamp("LOG") << "Output: " << Helpers::ToPrettyString(writer.sizeEntries()) << " entries into " << Helpers::ToPrettyString(writer.sizeBlocks()) << " blocks..." << std::endl; + + for(U32 i = 0; i < active_threads; ++i) + delete slaves[i]; + + delete [] slaves; + delete [] threads; + delete [] thread_distribution; + + return true; +} + +bool OutputSorter::sortMerge(const std::string& inputFile, const std::string& destinationPrefix, const U32 block_size){ + if(!this->reader.open(inputFile)){ + std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << inputFile << "..." << std::endl; + return false; + } + + if(this->reader.getIndex().getController().isSorted){ + std::cerr << Helpers::timestamp("LOG","SORT") << "File is already sorted..." << std::endl; + return true; + } + + if(this->reader.getIndex().getController().isPartialSorted == false){ + std::cerr << Helpers::timestamp("ERROR","SORT") << "File is not partially sorted..." 
<< std::endl; + return false; + } + + // Append executed command to literals + this->reader.getHeader().getLiterals() += "\n##tomahawk_mergeSortCommand=" + Helpers::program_string(); + + IO::OutputWriter writer; + if(!writer.open(destinationPrefix)){ + std::cerr << Helpers::timestamp("ERROR", "SORT") << "Failed to open: " << destinationPrefix << "..." << std::endl; + return false; + } + writer.setFlushLimit(block_size); + writer.writeHeaders(this->reader.getHeader()); + + const U32 n_toi_entries = this->reader.getIndex().size(); + std::ifstream* streams = new std::ifstream[n_toi_entries]; + tgzf_iterator** iterators = new tgzf_iterator*[n_toi_entries]; + + if(!SILENT) + std::cerr << Helpers::timestamp("LOG", "SORT") << "Opening " << n_toi_entries << " file handles..."; + + for(U32 i = 0; i < n_toi_entries; ++i){ + streams[i].open(inputFile); + streams[i].seekg(this->reader.getIndex().getContainer()[i].byte_offset); + iterators[i] = new tgzf_iterator(streams[i], 65536, this->reader.getIndex().getContainer()[i].byte_offset, this->reader.getIndex().getContainer()[i].byte_offset_end); + } + + if(!SILENT) + std::cerr << " Done!" << std::endl; + + // queue + queue_type outQueue; + + // + if(!SILENT) + std::cerr << Helpers::timestamp("LOG", "SORT") << "Merging..." << std::endl; + + // draw one from each + const entry_type* e = nullptr; + for(U32 i = 0; i < n_toi_entries; ++i){ + if(!iterators[i]->nextEntry(e)){ + std::cerr << Helpers::timestamp("ERROR", "SORT") << "Failed to get an entry..." << std::endl; + return false; + } + outQueue.push( queue_entry(e, i, entry_type::sortAscending) ); + } + + if(outQueue.empty()){ + std::cerr << Helpers::timestamp("ERROR","SORT") << "No data in queue..." 
<< std::endl; + return false; + } + + // while queue is not empty + while(outQueue.empty() == false){ + // peek at top entry in queue + const U32 id = outQueue.top().streamID; + writer << outQueue.top().data; + + // remove this record from the queue + outQueue.pop(); + + + while(iterators[id]->nextEntry(e)){ + if(!(*e < outQueue.top().data)){ + outQueue.push( queue_entry(e, id, entry_type::sortAscending) ); + break; + } + writer << *e; + } + } + + writer.setPartialSorted(false); + writer.setSorted(true); + writer.flush(); + writer.writeFinal(); + + if(!SILENT) + std::cerr << Helpers::timestamp("LOG") << "Output: " << Helpers::ToPrettyString(writer.sizeEntries()) << " entries into " << Helpers::ToPrettyString(writer.sizeBlocks()) << " blocks..." << std::endl; + + // Cleanup + for(U32 i = 0; i < n_toi_entries; ++i) + delete iterators[i]; + + delete [] iterators; + delete [] streams; + + return true; +} + +} +} diff --git a/src/algorithm/sort/output_sorter.h b/src/algorithm/sort/output_sorter.h new file mode 100644 index 0000000..bceb56d --- /dev/null +++ b/src/algorithm/sort/output_sorter.h @@ -0,0 +1,61 @@ +#ifndef TOMAHAWKOUTPUTSORT_H_ +#define TOMAHAWKOUTPUTSORT_H_ + +#include +#include + +#include "../../io/compression/TGZFEntryIterator.h" +#include "../../tomahawk/two/TomahawkOutputReader.h" +#include "output_sort_merge_queue.h" +#include "output_sort_slave.h" + +namespace Tomahawk{ +namespace Algorithm{ + +/**< + * Primary class for sorting `TWO` data + */ +class OutputSorter{ + typedef IO::OutputEntry entry_type; + typedef TomahawkOutputReader two_reader_type; + typedef IO::TGZFEntryIterator tgzf_iterator; + typedef OutputSorter self_type; + typedef OutputSortMergeQueue queue_entry; + typedef std::priority_queue queue_type; // priority queue + +public: + OutputSorter() : n_threads(std::thread::hardware_concurrency()){} + ~OutputSorter(){} + + /**< + * Standard sorting approach + * @param input + * @param destinationPrefix + * @param memory_limit + * @return + 
*/ + bool sort(const std::string& input, const std::string& destinationPrefix, U64 memory_limit); + + /**< + * N-way merge of paralell-sorted blocks + * @param input + * @param destinationPrefix + * @param block_size + * @return + */ + bool sortMerge(const std::string& input, const std::string& destinationPrefix, const U32 block_size); + + inline const size_t size(void) const{ return(this->n_threads); } + +private: + two_reader_type reader; + +public: + size_t n_threads; +}; + + +} +} + +#endif /* TOMAHAWKOUTPUTSORT_H_ */ diff --git a/src/calc.h b/src/calc.h index dc0ce6a..b78caa1 100644 --- a/src/calc.h +++ b/src/calc.h @@ -231,6 +231,9 @@ int calc(int argc, char** argv){ return 1; } + //Tomahawk::Totempole::TotempoleReader totempole_reader; + //Tomahawk::TomahawkReader tomahawk_reader; + //std::vector rets = totempole.findOverlaps(Tomahawk::Interval(0, 2221297, 10108169)); //std::cerr << "Found overlaps: " << rets.size() << std::endl; //for(U32 i = 0; i < rets.size(); ++i) diff --git a/src/concat.h b/src/concat.h index f8ed2b1..d1a02d3 100644 --- a/src/concat.h +++ b/src/concat.h @@ -22,8 +22,8 @@ DEALINGS IN THE SOFTWARE. */ #include +#include "tomahawk/two/TomahawkOutputReader.h" #include "utility.h" -#include "tomahawk/TomahawkOutput/TomahawkOutputReader.h" void concat_usage(void){ programMessage(); @@ -108,17 +108,21 @@ int concat(int argc, char** argv){ std::cerr << Tomahawk::Helpers::timestamp("LOG") << "Calling concat..." << std::endl; } - Tomahawk::IO::TomahawkOutputReader reader; + Tomahawk::TomahawkOutputReader reader; if(input.size() == 0){ + /* if(!reader.concat(files, output)){ std::cerr << Tomahawk::Helpers::timestamp("ERROR", "CONCAT") << "Failed to concat files!" << std::endl; return 1; } + */ } else { + /* if(!reader.concat(input, output)){ std::cerr << Tomahawk::Helpers::timestamp("ERROR", "CONCAT") << "Failed to concat files!" 
<< std::endl; return 1; } + */ } return 0; diff --git a/src/import.h b/src/import.h index a5c3160..8a9b7d5 100644 --- a/src/import.h +++ b/src/import.h @@ -151,9 +151,10 @@ int import(int argc, char** argv){ std::cerr << Tomahawk::Helpers::timestamp("LOG") << "Calling import..." << std::endl; } + Tomahawk::TomahawkImporter importer(input, output); - importer.getFilters().HWE_P = hwe_p; - importer.getFilters().MAF = maf; + importer.getFilters().HWE_P = hwe_p; + importer.getFilters().MAF = maf; importer.getFilters().missingness = missingness; if(!extension_mode){ diff --git a/src/index.h b/src/index.h deleted file mode 100644 index 9e708c2..0000000 --- a/src/index.h +++ /dev/null @@ -1,67 +0,0 @@ -/* -Copyright (C) 2016-2017 Genome Research Ltd. -Author: Marcus D. R. Klarqvist - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
-*/ -#include "utility.h" -#include "tomahawk/TomahawkReader.h" -#include "totempole/TotempoleReader.h" -#include "tomahawk/TomahawkOutput/TomahawkOutputReader.h" - -int index(int argc, char** argv){ - argc -= 2; argv += 2; - programMessage(); - std::cerr << Tomahawk::Helpers::timestamp("LOG") << "Calling index..." << std::endl; - - if(argc < 1){ - std::cerr << argc << std::endl; - std::cerr << Tomahawk::Helpers::timestamp("ERROR") << "Missing parameters" << std::endl; - return(1); - } - - std::string inputFile(&argv[0][0]); - - // Parse file suffix - std::vector paths = Tomahawk::Helpers::splitLastOf(inputFile, '/', true); - std::vector files = Tomahawk::Helpers::splitLastOf(paths[1], '.'); - - // Todo: if failed to read from file suffix: try to look into file header MAGIC - if(files[1].size() == 0){ - std::cerr << "could not determine file type from suffix" << std::endl; - return false; - } - - std::transform(files[1].begin(), files[1].end(), files[1].begin(), ::tolower); - - if(files[1] == Tomahawk::Constants::OUTPUT_SUFFIX){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR","INDEX") << "Twk files are already indexed..." << std::endl; - } else if(files[1] == Tomahawk::Constants::OUTPUT_LD_SUFFIX) { - Tomahawk::IO::TomahawkOutputReader reader; - - if(!reader.index(inputFile)){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "INDEX") << "Failed to index file!" 
<< std::endl; - return 1; - } - } else { - std::cerr << "Unknown file type" << std::endl; - } - - return 0; -} diff --git a/src/index/footer.h b/src/index/footer.h new file mode 100644 index 0000000..e11da60 --- /dev/null +++ b/src/index/footer.h @@ -0,0 +1,73 @@ +#ifndef TOTEMPOLEHEADER_H_ +#define TOTEMPOLEHEADER_H_ + +#include + +#include "../support/MagicConstants.h" + +namespace Tomahawk { +namespace Totempole { + +#define TWK_FOOTER_LENGTH (Constants::eof_length + sizeof(U32) + sizeof(U64)) + +struct Footer{ +public: + typedef Footer self_type; + +public: + Footer() : + offset_end_of_data(0), + l_largest_uncompressed(0) + { + Helpers::HexToBytes(Constants::eof_hex, &this->EOF_marker[0]); + } + + Footer(const char* const data) : + offset_end_of_data(*reinterpret_cast(data)), + l_largest_uncompressed(*reinterpret_cast(&data[sizeof(U64)])) + { + memcpy(&this->EOF_marker[0], &data[sizeof(U64)+sizeof(U32)], Constants::eof_length); + } + + ~Footer() = default; + + inline const U64& getEODPosition(void) const{ return(this->offset_end_of_data); } + inline const U32& getLargestUncompressedBlock(void) const{ return(this->l_largest_uncompressed); } + inline U64& getEODPosition(void){ return(this->offset_end_of_data); } + inline U32& getLargestUncompressedBlock(void){ return(this->l_largest_uncompressed); } + + inline const bool validate(void) const{ + if(this->offset_end_of_data == 0) return false; + if(this->l_largest_uncompressed == 0) return false; + + BYTE reference[Constants::eof_length]; + Helpers::HexToBytes(Constants::eof_hex, &reference[0]); + + if(strncmp(reinterpret_cast(&this->EOF_marker[0]), reinterpret_cast(&reference[0]), Constants::eof_length) != 0) return false; + return true; + } + + friend std::ostream& operator<<(std::ostream& stream, const self_type& footer){ + stream.write(reinterpret_cast(&footer.offset_end_of_data), sizeof(U64)); + stream.write(reinterpret_cast(&footer.l_largest_uncompressed), sizeof(U32)); + 
stream.write(reinterpret_cast(&footer.EOF_marker[0]), Constants::eof_length); + return(stream); + } + + friend std::istream& operator>>(std::istream& stream, self_type& footer){ + stream.read(reinterpret_cast(&footer.offset_end_of_data), sizeof(U64)); + stream.read(reinterpret_cast(&footer.l_largest_uncompressed), sizeof(U32)); + stream.read(reinterpret_cast(&footer.EOF_marker[0]), Constants::eof_length); + return(stream); + } + +public: + U64 offset_end_of_data; // number of blocks in Tomahawk + U32 l_largest_uncompressed; // largest block-size in bytes + BYTE EOF_marker[Constants::eof_length]; +}; + +} +} + +#endif /* TOTEMPOLEHEADER_H_ */ diff --git a/src/index/index.cpp b/src/index/index.cpp new file mode 100644 index 0000000..d88f79a --- /dev/null +++ b/src/index/index.cpp @@ -0,0 +1,67 @@ +#include "index.h" + +namespace Tomahawk{ + +Index::Index(){} +Index::~Index(){} + +// Reading an index from a byte stream +Index::Index(const char* const data, const U32 l_data) : + controller_(data), + meta_container_(&data[sizeof(BYTE)], *reinterpret_cast(&data[sizeof(BYTE)])*TWK_INDEX_META_ENTRY_SIZE+sizeof(size_type)), + container_(&data[sizeof(BYTE) + this->meta_container_.size() * TWK_INDEX_META_ENTRY_SIZE + sizeof(size_type)], l_data - (this->meta_container_.size() * TWK_INDEX_META_ENTRY_SIZE + sizeof(size_type) + sizeof(BYTE))) +{ + +} + +bool Index::buildMetaIndex(const U32 n_contigs){ + if(this->getContainer().size() == 0) + return false; + + if(this->isSorted() == false) + return false; + + meta_entry_type reference_entry; + reference_entry.index_begin = 0; + reference_entry.index_end = 1; + reference_entry.min_position = this->getContainer()[0].min_position; + reference_entry.max_position = this->getContainer()[0].max_position; + reference_entry.n_variants = this->getContainer()[0].n_variants; + reference_entry.uncompressed_size = this->getContainer()[0].uncompressed_size; + U32 reference_contig = this->getContainer()[0].contigID; + + meta_entry_type* 
temp_entries = new meta_entry_type[n_contigs]; + + for(U32 i = 1; i < this->getContainer().size(); ++i){ + if(this->getContainer()[i].contigID != reference_contig){ + if(this->getContainer()[i].contigID < reference_contig) + continue; + + temp_entries[reference_contig] = reference_entry; + reference_contig = this->getContainer()[i].contigID; + reference_entry.index_begin = i; + reference_entry.index_end = i + 1; + reference_entry.min_position = this->getContainer()[i].min_position; + reference_entry.max_position = this->getContainer()[i].max_position; + reference_entry.n_variants = this->getContainer()[i].n_variants; + reference_entry.uncompressed_size = this->getContainer()[i].uncompressed_size; + + } else { + ++reference_entry.index_end; + reference_entry.max_position = this->getContainer()[i].max_position; + reference_entry.n_variants += this->getContainer()[i].n_variants; + reference_entry.uncompressed_size += this->getContainer()[i].uncompressed_size; + } + } + temp_entries[reference_contig] = reference_entry; + + for(U32 i = 0; i < n_contigs; ++i){ + //std::cerr << temp_entries[i] << std::endl; + this->meta_container_ += temp_entries[i]; + } + delete [] temp_entries; + + return(true); +} + +} diff --git a/src/index/index.h b/src/index/index.h new file mode 100644 index 0000000..aa58d1c --- /dev/null +++ b/src/index/index.h @@ -0,0 +1,128 @@ +#ifndef INDEX_INDEX_H_ +#define INDEX_INDEX_H_ + +#include "index_contig.h" +#include "index_container.h" +#include "index_meta_container.h" +#include "../io/BasicBuffer.h" +#include "footer.h" + +namespace Tomahawk{ + +/**< + * Index controller for bit flags + */ +struct IndexController{ +public: + typedef IndexController self_type; + +public: + IndexController() : + isSorted(false), + isPartialSorted(false), + unused(0) + {} + + IndexController(const char* const data){ memcpy(this, data, sizeof(BYTE)); } + +private: + friend std::ofstream& operator<<(std::ofstream& stream, const self_type& controller){ + 
stream.write((const char*)&controller, sizeof(BYTE)); + return stream; + } + + friend std::ifstream& operator>>(std::ifstream& stream, self_type& controller){ + stream.read((char*)&controller, sizeof(BYTE)); + return stream; + } + +public: + BYTE isSorted: 1, + isPartialSorted: 1, + unused: 6; +}; + +/**< + * This container handles the index entries for `twk` blocks: their + * start and end IO positions and what genomic regions they cover. + * The value type of this container are containers of entries. + */ +class Index{ +private: + typedef Index self_type; + typedef Totempole::Footer footer_type; + typedef Totempole::IndexEntry value_type; + typedef Totempole::IndexContainer container_type; + typedef Totempole::IndexMetaContainer meta_container_type; + typedef Totempole::IndexMetaEntry meta_entry_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + typedef IO::BasicBuffer buffer_type; + typedef IndexController controller_type; + +public: + Index(); + ~Index(); + + // Reading an index from a byte stream + Index(const char* const data, const U32 l_data); + + // Capacity + inline const size_type& size(void) const{ return(this->container_.size()); } + inline const size_type& sizeMeta(void) const{ return(this->meta_container_.size()); } + + // Accessors + inline container_type& getContainer(void){ return(this->container_); } + inline const container_type& getContainer(void) const{ return(this->container_); } + inline meta_container_type& getMetaContainer(void){ return(this->meta_container_); } + inline const meta_container_type& getMetaContainer(void) const{ return(this->meta_container_); } + inline controller_type& getController(void){ return(this->controller_); } + inline const controller_type& getController(void) const{ return(this->controller_); } + + // Setters + inline void 
setSorted(const bool yes){ this->controller_.isSorted = yes; } + inline void setPartialSorted(const bool yes){ this->controller_.isPartialSorted = yes; } + + // Getters + inline const bool isSorted(void) const{ return(this->controller_.isSorted); } + inline const bool isPartialSorted(void) const{ return(this->controller_.isPartialSorted); } + inline const U64 totalBytes(void) const{ + U64 total_bytes = 0; + for(size_t i = 0; i < this->getContainer().size(); ++i) + total_bytes += this->getContainer().at(i).sizeBytes(); + + return(total_bytes); + } + + // Overloaded + inline void operator<<(const_reference entry){ this->container_ += entry; } + inline void operator+=(const_reference entry){ this->container_ += entry; } + + /**< + * Constructs the index of index if the data is sorted + * @param n_contigs Number of contigs in the file + * @return Returns TRUE upon success or FALSE otherwise + */ + bool buildMetaIndex(const U32 n_contigs); + +private: + friend std::ofstream& operator<<(std::ofstream& stream, const self_type& index){ + stream << index.getController(); + stream << index.getMetaContainer(); + stream << index.getContainer(); + return(stream); + } + +private: + controller_type controller_; + meta_container_type meta_container_; + container_type container_; +}; + +} + +#endif /* INDEX_INDEX_H_ */ diff --git a/src/index/index_container.cpp b/src/index/index_container.cpp new file mode 100644 index 0000000..7bbdf40 --- /dev/null +++ b/src/index/index_container.cpp @@ -0,0 +1,143 @@ +#include "index_container.h" + +namespace Tomahawk{ +namespace Totempole{ + +IndexContainer::IndexContainer(void) : + n_entries_(0), + n_capacity_(1000), + entries_(static_cast(::operator new[](this->capacity()*sizeof(value_type)))) +{ + +} + +IndexContainer::IndexContainer(const size_t n_capacity_) : + n_entries_(0), + n_capacity_(n_capacity_), + entries_(static_cast(::operator new[](this->capacity()*sizeof(value_type)))) +{ + +} + +// Functions for when interpreting from a byte 
stream
+// first value is the number of indices
+//
+// Layout read here: one size_type entry count followed by that many
+// fixed-width records of TWK_INDEX_ENTRY_SIZE bytes each. The final assert
+// checks that exactly l_data bytes were consumed.
+IndexContainer::IndexContainer(const char* const data_buffer, const U32 l_data) :
+    n_entries_(*reinterpret_cast<const size_type*>(data_buffer)),
+    n_capacity_(this->n_entries_),
+    entries_(static_cast<pointer>(::operator new[](this->capacity()*sizeof(value_type))))
+{
+    U32 cumulative_position = sizeof(size_type);
+    for(size_type i = 0; i < this->size(); ++i){
+        new( &this->entries_[i] ) value_type(&data_buffer[cumulative_position]);
+        cumulative_position += TWK_INDEX_ENTRY_SIZE;
+    }
+    assert(cumulative_position == l_data);
+}
+
+// Destroys the constructed entries, then releases the raw storage.
+IndexContainer::~IndexContainer(){
+    for(size_type i = 0; i < this->size(); ++i)
+        ((this->entries_ + i)->~IndexEntry)();
+
+    ::operator delete[](static_cast<void*>(this->entries_));
+}
+
+// Appends a copy of `index_entry`, growing the storage first when it is
+// (nearly) full.
+IndexContainer& IndexContainer::operator+=(const value_type& index_entry){
+    if(this->size() + 1 >= this->capacity()){
+        // A container built with zero capacity (for example from a byte
+        // stream holding no entries) cannot grow by doubling: 0*2 is still 0
+        // and the placement new below would write outside the allocation.
+        this->resize(this->capacity() > 0 ? this->capacity()*2 : 2);
+    }
+
+    new( &this->entries_[this->n_entries_] ) value_type(index_entry); // invoke copy ctor
+    ++this->n_entries_;
+    return(*this);
+}
+
+// Changes the capacity to `new_capacity`.
+// Smaller than the current capacity: the allocation is kept and only the
+// entries at positions >= new_capacity are destroyed.
+// Otherwise: a new allocation is made and every entry is copied across.
+void IndexContainer::resize(const size_t new_capacity){
+    if(new_capacity < this->capacity()){
+        // Only truncate when entries actually fall outside the new bound.
+        // Assigning n_entries_ unconditionally would raise the element count
+        // over memory that was never constructed.
+        if(new_capacity < this->size()){
+            for(size_type i = new_capacity; i < this->size(); ++i)
+                ((this->entries_ + i)->~IndexEntry)();
+
+            this->n_entries_ = new_capacity;
+        }
+        return;
+    }
+
+    pointer temp = this->entries_; // Move current data pointer
+    this->entries_ = static_cast<pointer>(::operator new[](new_capacity*sizeof(value_type))); // Allocate new memory at old pointer
+    // Copy data over from temporary data pointer to new pointer
+    for(size_type i = 0; i < this->size(); ++i)
+        new( &this->entries_[i] ) value_type(temp[i]);
+
+    // Release memory from the temporary address
+    for(size_type i = 0; i < this->size(); ++i)
+        ((temp + i)->~IndexEntry)();
+
+    ::operator delete[](static_cast<void*>(temp));
+    this->n_capacity_ = new_capacity;
+}
+
+namespace{
+
+// Shared scan behind the findOverlap() overloads: skips entries until the
+// first one accepted by `matches`, then extends over the directly following
+// accepted entries. Returns [first, last); both pointers are one-past-the-end
+// when nothing matches.
+template <class Predicate>
+std::pair<const IndexEntry*, const IndexEntry*> findFirstRun(const IndexEntry* const entries, const size_t n_entries, Predicate matches){
+    size_t i = 0;
+    while(i < n_entries && matches(entries[i]) == false)
+        ++i;
+
+    const size_t from = i;
+    while(i < n_entries && matches(entries[i]))
+        ++i;
+
+    return(std::pair<const IndexEntry*, const IndexEntry*>(entries + from, entries + i));
+}
+
+}
+
+// First contiguous run of blocks on contig `contigID`.
+std::pair<const IndexEntry*, const IndexEntry*> IndexContainer::findOverlap(const S32& contigID) const{
+    return(findFirstRun(this->data(), this->size(),
+        [&](const IndexEntry& entry){ return(entry.overlaps(contigID)); }));
+}
+
+// First contiguous run of blocks on `contigID` that cover `position`.
+std::pair<const IndexEntry*, const IndexEntry*> IndexContainer::findOverlap(const S32& contigID, const U64& position) const{
+    return(findFirstRun(this->data(), this->size(),
+        [&](const IndexEntry& entry){ return(entry.overlaps(contigID, position)); }));
+}
+
+// First contiguous run of blocks on `contigID` accepted by
+// IndexEntry::overlaps(contigID, from_position, to_position).
+std::pair<const IndexEntry*, const IndexEntry*> IndexContainer::findOverlap(const S32& contigID, const U64& from_position, const U64& to_position) const{
+    return(findFirstRun(this->data(), this->size(),
+        [&](const IndexEntry& entry){ return(entry.overlaps(contigID, from_position, to_position)); }));
+}
+
+}
+}
diff --git a/src/index/index_container.h b/src/index/index_container.h
new file mode 100644
index 0000000..8951d26
--- /dev/null
+++ b/src/index/index_container.h
@@ -0,0 +1,128 @@
+#ifndef INDEX_INDEX_CONTAINER_H_
+#define INDEX_INDEX_CONTAINER_H_
+
+#include <cassert> // assert
+#include <cstddef> // size_t, ptrdiff_t
+#include <fstream> // ofstream
+#include <iterator> // forward_iterator_tag
+#include <utility> // pair
+
+#include "../support/type_definitions.h"
+#include "../io/BasicBuffer.h"
+#include "index_entry.h"
+
+namespace Tomahawk{
+namespace Totempole{
+
+/**<
+ * STL-like container for Tomahawk index entries
+ *
+ * Storage is a raw allocation of `capacity()` slots of which the first
+ * `size()` hold constructed entries.
+ *
+ * NOTE(review): the class owns raw memory but declares no copy
+ * constructor/assignment - confirm it is never copied.
+ */
+class IndexContainer{
+private:
+    typedef IndexContainer    self_type;
+    typedef IndexEntry        value_type;
+    typedef value_type&       reference;
+    typedef const value_type& const_reference;
+    typedef value_type*       pointer;
+    typedef const value_type* const_pointer;
+    typedef std::ptrdiff_t    difference_type;
+    typedef std::size_t       size_type;
+    typedef IO::BasicBuffer   buffer_type;
+
+public:
+    IndexContainer(void);
+    IndexContainer(const size_t n_capacity_);
+
+    // Functions for when interpreting from a byte stream
+    // first value is the number of indices
+    IndexContainer(const char* const data_buffer, const U32 l_data);
+    ~IndexContainer();
+
+    // Thin pointer wrapper; only ++, *, ->, == and != are provided.
+    class iterator{
+    private:
+        typedef iterator self_type;
+        typedef std::forward_iterator_tag iterator_category;
+
+    public:
+        iterator(pointer ptr) : ptr_(ptr) { }
+        void operator++() { ptr_++; }
+        void operator++(int junk) { ptr_++; }
+        reference operator*() const{ return *ptr_; }
+        pointer operator->() const{ return ptr_; }
+        bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; }
+        bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; }
+    private:
+        pointer ptr_;
+    };
+
+    // Read-only counterpart of `iterator`; holds a pointer-to-const so the
+    // iterator itself cannot be used to reach mutable state.
+    class const_iterator{
+    private:
+        typedef const_iterator self_type;
+        typedef std::forward_iterator_tag iterator_category;
+
+    public:
+        const_iterator(const_pointer ptr) : ptr_(ptr) { }
+        void operator++() { ptr_++; }
+        void operator++(int junk) { ptr_++; }
+        const_reference operator*() const{ return *ptr_; }
+        const_pointer operator->() const{ return ptr_; }
+        bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; }
+        bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; }
+    private:
+        const_pointer ptr_;
+    };
+
+    // Element access
+    // None of these are bounds checked, at() included.
+    inline reference at(const size_type& position){ return(this->entries_[position]); }
+    inline const_reference at(const size_type& position) const{ return(this->entries_[position]); }
+    inline reference operator[](const size_type& position){ return(this->entries_[position]); }
+    inline const_reference operator[](const size_type& position) const{ return(this->entries_[position]); }
+    inline pointer data(void){ return(this->entries_); }
+    inline const_pointer data(void) const{ return(this->entries_); }
+    inline reference front(void){ return(this->entries_[0]); }
+    inline const_reference front(void) const{ return(this->entries_[0]); }
+    inline reference back(void){ return(this->entries_[this->n_entries_ - 1]); }
+    inline const_reference back(void) const{ return(this->entries_[this->n_entries_ - 1]); }
+
+    // Capacity
+    inline const bool empty(void) const{ return(this->n_entries_ == 0); }
+    inline const size_type& size(void) const{ return(this->n_entries_); }
+    inline const size_type& capacity(void) const{ return(this->n_capacity_); }
+
+    void resize(const size_t new_capacity);
+    // Doubles the capacity. A zero capacity cannot grow by doubling, so it
+    // is bumped to 2 instead.
+    inline void resize(void){ this->resize(this->capacity() > 0 ? this->capacity()*2 : 2); }
+
+    // Iterator
+    inline iterator begin(){ return iterator(&this->entries_[0]); }
+    inline iterator end() { return iterator(&this->entries_[this->n_entries_]); }
+    inline const_iterator begin() const{ return const_iterator(&this->entries_[0]); }
+    inline const_iterator end() const{ return const_iterator(&this->entries_[this->n_entries_]); }
+    inline const_iterator cbegin() const{ return const_iterator(&this->entries_[0]); }
+    inline const_iterator cend() const{ return const_iterator(&this->entries_[this->n_entries_]); }
+
+    // Overload basic operator
+    self_type& operator+=(const value_type& index_entry);
+
+    // Overlap functions: find blocks a target interval overlaps
+    // returns pairs of pointers. If pointerA == pointerB then the data is empty
+    std::pair<const_pointer, const_pointer> findOverlap(const S32& contigID) const;
+    std::pair<const_pointer, const_pointer> findOverlap(const S32& contigID, const U64& position) const;
+    std::pair<const_pointer, const_pointer> findOverlap(const S32& contigID, const U64& from_position, const U64& to_position) const;
+
+private:
+    // Writes the entry count (size_type) followed by every entry.
+    friend std::ofstream& operator<<(std::ofstream& stream, const self_type& container){
+        stream.write(reinterpret_cast<const char*>(&container.n_entries_), sizeof(size_type));
+        for(size_type i = 0; i < container.size(); ++i)
+            stream << container[i];
+
+        return stream;
+    }
+
+private:
+    size_type n_entries_;
+    size_type n_capacity_;
+    pointer   entries_;
+};
+
+}
+}
+
+#endif /* INDEX_INDEX_CONTAINER_H_ */
diff --git a/src/index/index_contig.h b/src/index/index_contig.h
new file mode 100644
index 0000000..4f9e08b
--- /dev/null
+++ b/src/index/index_contig.h
@@ -0,0 +1,135 @@
+#ifndef TOTEMPOLECONTIG_H_
+#define TOTEMPOLECONTIG_H_
+
+// NOTE(review): the header names on the original include lines were lost;
+// these are the headers the code below needs - confirm against the repository.
+#include <cstring> // memcpy
+#include <fstream>
+#include <string>
+
+#include "../support/type_definitions.h"
+#include "../io/BasicBuffer.h"
+
+namespace Tomahawk{
+namespace Totempole{
+
+/**<
+ * Contig record of the file header.
+ * Binary layout: |n_bases (U32)|n_char (U32)|name (n_char bytes)|
+ *
+ * The binary writers emit `n_char` as the length field but
+ * `name.size()` bytes of text, so the two must agree.
+ */
+struct HeaderContig{
+public:
+    typedef HeaderContig    self_type;
+    typedef IO::BasicBuffer buffer_type;
+
+public:
+    HeaderContig(const U32& bases, const U32& n_char, const std::string& name) :
+        n_bases(bases),
+        n_char(n_char),
+        name(name)
+    {}
+
+    HeaderContig() : n_bases(0), n_char(0){}
+
+    // Decodes one record starting at `data`.
+    HeaderContig(const char* const data) :
+        n_bases(*reinterpret_cast<const U32*>(data)),
+        n_char(*reinterpret_cast<const U32*>(&data[sizeof(U32)]))
+    {
+        this->name.resize(this->n_char);
+        memcpy(&this->name[0], &data[sizeof(U32)+sizeof(U32)], this->n_char);
+    }
+
+    ~HeaderContig(){}
+
+    // Decodes one record starting at `data` and returns the number of
+    // bytes it occupied.
+    const U32 interpret(const char* const data){
+        this->n_bases = *reinterpret_cast<const U32*>(data);
+        this->n_char  = *reinterpret_cast<const U32*>(&data[sizeof(U32)]);
+        this->name.resize(this->n_char);
+        memcpy(&this->name[0], &data[sizeof(U32)+sizeof(U32)], this->n_char);
+        return(sizeof(U32) + sizeof(U32) + this->n_char);
+    }
+
+    // Assigns the fields directly; returns the encoded size implied by n_char.
+    const U32 interpret(const U32& bases, const U32& n_char, const std::string& name){
+        this->n_bases = bases;
+        this->n_char  = n_char;
+        this->name    = name;
+        return(sizeof(U32) + sizeof(U32) + this->n_char);
+    }
+
+    // Human-readable, tab-delimited form.
+    friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){
+        stream << entry.n_bases << '\t' << entry.n_char << '\t' << entry.name;
+        return stream;
+    }
+
+    // Binary form.
+    friend std::ofstream& operator<<(std::ofstream& stream, const self_type& base){
+        stream.write(reinterpret_cast<const char*>(&base.n_bases), sizeof(U32));
+        stream.write(reinterpret_cast<const char*>(&base.n_char),  sizeof(U32));
+        stream.write(reinterpret_cast<const char*>(&base.name[0]), base.name.size());
+        return(stream);
+    }
+
+    friend std::istream& operator>>(std::istream& stream, self_type& base){
+        stream.read(reinterpret_cast<char*>(&base.n_bases), sizeof(U32));
+        stream.read(reinterpret_cast<char*>(&base.n_char),  sizeof(U32));
+        base.name.resize(base.n_char);
+        stream.read(&base.name[0], base.n_char);
+        return(stream);
+    }
+
+    // Appends the binary form to a buffer.
+    friend buffer_type& operator+=(buffer_type& buffer, self_type& base){
+        buffer += base.n_bases;
+        buffer += base.n_char;
+        buffer.Add(base.name.data(), base.name.size());
+        return(buffer);
+    }
+
+public:
+    U32 n_bases;      // length of contig
+    U32 n_char;       // number of chars
+    std::string name; // contig name
+};
+
+/**<
+ * HeaderContig extended with the covered position range and the
+ * range of blocks belonging to the contig.
+ */
+struct IndexContig : public HeaderContig{
+public:
+    typedef IndexContig  self_type;
+    typedef HeaderContig parent_type;
+
+public:
+    IndexContig() : min_position(0), max_position(0), blocks_start(0), blocks_end(0){}
+    ~IndexContig(){}
+
+    friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){
+        stream << entry.name << '\t' << entry.n_bases << '\t' << entry.min_position << "-" << entry.max_position << '\t' << entry.blocks_start << "->" << entry.blocks_end;
+        return stream;
+    }
+
+    // Binary form: the HeaderContig record followed by the four U32 fields.
+    friend std::ofstream& operator<<(std::ofstream& stream, const self_type& base){
+        // Upcast so that the parent's binary writer is selected.
+        const parent_type* const parent = static_cast<const parent_type*>(&base);
+        stream << *parent;
+
+        stream.write(reinterpret_cast<const char*>(&base.min_position), sizeof(U32));
+        stream.write(reinterpret_cast<const char*>(&base.max_position), sizeof(U32));
+        stream.write(reinterpret_cast<const char*>(&base.blocks_start), sizeof(U32));
+        stream.write(reinterpret_cast<const char*>(&base.blocks_end),   sizeof(U32));
+        return(stream);
+    }
+
+    friend std::istream& operator>>(std::istream& stream, self_type& base){
+        parent_type* parent = static_cast<parent_type*>(&base);
+        stream >> *parent;
+
+        stream.read(reinterpret_cast<char*>(&base.min_position), sizeof(U32));
+        stream.read(reinterpret_cast<char*>(&base.max_position), sizeof(U32));
+        stream.read(reinterpret_cast<char*>(&base.blocks_start), sizeof(U32));
+        stream.read(reinterpret_cast<char*>(&base.blocks_end),   sizeof(U32));
+        return(stream);
+    }
+
+public:
+    // contigID is implicit
+    U32 min_position; // start position of contig
+    U32 max_position; // end position of contig
+    U32 blocks_start; // start IO-seek position of blocks
+    U32 blocks_end;   // end IO-seek position of blocks
+};
+
+}
+}
+
+
+
+
+#endif /* TOTEMPOLECONTIG_H_ */
diff --git a/src/index/index_entry.h b/src/index/index_entry.h
new file mode 100644
index 0000000..4478271
--- /dev/null
+++ b/src/index/index_entry.h
@@ -0,0 +1,119 @@
+#ifndef TOTEMPOLEENTRY_H_
+#define TOTEMPOLEENTRY_H_
+
+// NOTE(review): the header name on the original include line was lost;
+// <fstream> covers the stream types used below - confirm.
+#include <fstream>
+
+#include "../support/type_definitions.h" // U32, U64, S32
+
+namespace Tomahawk{
+namespace Totempole{
+
+// Encoded width of one entry: 4 x U64, 1 x S32, 2 x U32
+#define TWK_INDEX_ENTRY_SIZE (sizeof(U64)*4 + sizeof(S32) + sizeof(U32)*2)
+
+/**<
+ * Index record for a single `twk` block: where the block lives in the
+ * file and which contig/position range it covers.
+ */
+struct IndexEntry{
+public:
+    typedef IndexEntry self_type;
+
+public:
+    IndexEntry() :
+        byte_offset(0),
+        byte_offset_end(0),
+        contigID(0),
+        min_position(0),
+        max_position(0),
+        n_variants(0),
+        uncompressed_size(0)
+    {
+    }
+
+    // Decodes one packed record; fields are read in declaration order
+    // with no padding between them.
+    IndexEntry(const char* const data) :
+        byte_offset(*reinterpret_cast<const U64*>(data)),
+        byte_offset_end(*reinterpret_cast<const U64*>(&data[sizeof(U64)])),
+        contigID(*reinterpret_cast<const S32*>(&data[sizeof(U64)*2])),
+        min_position(*reinterpret_cast<const U64*>(&data[sizeof(U64)*2+sizeof(S32)])),
+        max_position(*reinterpret_cast<const U64*>(&data[sizeof(U64)*2+sizeof(S32)+sizeof(U64)])),
+        n_variants(*reinterpret_cast<const U32*>(&data[sizeof(U64)*2+sizeof(S32)+sizeof(U64)*2])),
+        uncompressed_size(*reinterpret_cast<const U32*>(&data[sizeof(U64)*2+sizeof(S32)+sizeof(U64)*2+sizeof(U32)]))
+    {
+    }
+
+    // Copy ctor
+    IndexEntry(const self_type& other) :
+        byte_offset(other.byte_offset),
+        byte_offset_end(other.byte_offset_end),
+        contigID(other.contigID),
+        min_position(other.min_position),
+        max_position(other.max_position),
+        n_variants(other.n_variants),
+        uncompressed_size(other.uncompressed_size)
+    {
+    }
+    ~IndexEntry() = default;
+
+    inline U32 size(void) const{ return(this->n_variants); }
+    inline bool isValid(void) const{ return(this->byte_offset != 0); }
+    inline void operator++(void){ ++this->n_variants; }
+    inline U64 sizeBytes(void) const{ return(this->byte_offset_end - this->byte_offset); }
+
+    // Human-readable, tab-delimited form.
+    friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){
+        stream << entry.byte_offset << '\t' << entry.byte_offset_end << '\t' << entry.contigID << '\t' << entry.min_position << '-' << entry.max_position << '\t' << entry.n_variants << '\t' << entry.uncompressed_size;
+        return stream;
+    }
+
+    // Binary form; TWK_INDEX_ENTRY_SIZE bytes.
+    friend std::ofstream& operator<<(std::ofstream& stream, const self_type& entry){
+        stream.write(reinterpret_cast<const char*>(&entry.byte_offset),       sizeof(U64));
+        stream.write(reinterpret_cast<const char*>(&entry.byte_offset_end),   sizeof(U64));
+        stream.write(reinterpret_cast<const char*>(&entry.contigID),          sizeof(S32));
+        stream.write(reinterpret_cast<const char*>(&entry.min_position),      sizeof(U64));
+        stream.write(reinterpret_cast<const char*>(&entry.max_position),      sizeof(U64));
+        stream.write(reinterpret_cast<const char*>(&entry.n_variants),        sizeof(U32));
+        stream.write(reinterpret_cast<const char*>(&entry.uncompressed_size), sizeof(U32));
+        return stream;
+    }
+
+    friend std::istream& operator>>(std::istream& stream, self_type& entry){
+        stream.read(reinterpret_cast<char*>(&entry.byte_offset),       sizeof(U64));
+        stream.read(reinterpret_cast<char*>(&entry.byte_offset_end),   sizeof(U64));
+        stream.read(reinterpret_cast<char*>(&entry.contigID),          sizeof(S32));
+        stream.read(reinterpret_cast<char*>(&entry.min_position),      sizeof(U64));
+        stream.read(reinterpret_cast<char*>(&entry.max_position),      sizeof(U64));
+        stream.read(reinterpret_cast<char*>(&entry.n_variants),        sizeof(U32));
+        stream.read(reinterpret_cast<char*>(&entry.uncompressed_size), sizeof(U32));
+
+        return(stream);
+    }
+
+    void reset(void){
+        this->byte_offset       = 0;
+        this->byte_offset_end   = 0;
+        this->contigID          = 0;
+        this->min_position      = 0;
+        this->max_position      = 0;
+        this->n_variants        = 0;
+        this->uncompressed_size = 0;
+    }
+
+    // TRUE when the block lies on `contigID`.
+    inline const bool overlaps(const S32& contigID) const{ return(this->contigID == contigID); }
+
+    // TRUE when the block lies on `contigID` and `position` falls inside
+    // [min_position, max_position].
+    inline const bool overlaps(const S32& contigID, const U64& position) const{
+        if(this->contigID != contigID) return false;
+        return(position >= this->min_position && position <= this->max_position);
+    }
+
+    // TRUE when the block lies on `contigID` and the closed query interval
+    // [from_position, to_position] shares at least one position with
+    // [min_position, max_position]. Requiring the query to sit entirely
+    // inside the block would reject every query that straddles two blocks.
+    inline const bool overlaps(const S32& contigID, const U64& from_position, const U64& to_position) const{
+        if(this->contigID != contigID) return false;
+        if(to_position   < this->min_position) return false;
+        if(from_position > this->max_position) return false;
+        return true;
+    }
+
+public:
+    U64 byte_offset;       // tellg() position in stream for start of record in Tomahawk file
+    U64 byte_offset_end;   // tellg() position in stream for end of record in Tomahawk file
+    S32 contigID;          // contig identifier
+    U64 min_position;      // smallest bp position in tomahawk block
+    U64 max_position;      // largest bp position in tomahawk block
+    U32 n_variants;        // number of variants in this block
+    U32 uncompressed_size; // uncompressed size of this block
+};
+
+}
+}
+
+#endif /* TOTEMPOLEENTRY_H_ */
diff --git a/src/index/index_meta_container.h b/src/index/index_meta_container.h
new file mode 100644
index 0000000..bad6d14
--- /dev/null
+++ b/src/index/index_meta_container.h
@@ -0,0 +1,187 @@
+#ifndef INDEX_INDEX_META_CONTAINER_H_
+#define INDEX_INDEX_META_CONTAINER_H_
+
+// NOTE(review): the first header name was lost; <cassert> is what the
+// assert() further down requires - confirm.
+#include <cassert>
+#include <cstddef> // size_t, ptrdiff_t
+#include <iterator> // forward_iterator_tag
+
+#include "../support/type_definitions.h"
+#include "../io/BasicBuffer.h"
+#include "index_meta_entry.h"
+
+namespace
Tomahawk{
+namespace Totempole{
+
+/**<
+ * STL-like container for Tomahawk meta index entries
+ *
+ * Storage is a raw allocation of `capacity()` slots of which the first
+ * `size()` hold constructed entries.
+ *
+ * NOTE(review): the class owns raw memory but declares no copy
+ * constructor/assignment - confirm it is never copied.
+ */
+class IndexMetaContainer{
+private:
+    typedef IndexMetaContainer self_type;
+    typedef IndexMetaEntry     value_type;
+    typedef value_type&        reference;
+    typedef const value_type&  const_reference;
+    typedef value_type*        pointer;
+    typedef const value_type*  const_pointer;
+    typedef std::ptrdiff_t     difference_type;
+    typedef std::size_t        size_type;
+    typedef IO::BasicBuffer    buffer_type;
+
+public:
+    // Default: room for 1000 entries.
+    IndexMetaContainer(void) :
+        n_entries_(0),
+        n_capacity_(1000),
+        entries_(static_cast<pointer>(::operator new[](this->capacity()*sizeof(value_type))))
+    {
+
+    }
+
+    IndexMetaContainer(const size_t n_capacity_) :
+        n_entries_(0),
+        n_capacity_(n_capacity_),
+        entries_(static_cast<pointer>(::operator new[](this->capacity()*sizeof(value_type))))
+    {
+
+    }
+
+    // Functions for when interpreting from a byte stream
+    // first value is the number of indices
+    // followed by that many records of TWK_INDEX_META_ENTRY_SIZE bytes each.
+    IndexMetaContainer(const char* const data_buffer, const U32 l_data) :
+        n_entries_(*reinterpret_cast<const size_type*>(data_buffer)),
+        n_capacity_(this->n_entries_),
+        entries_(static_cast<pointer>(::operator new[](this->capacity()*sizeof(value_type))))
+    {
+        U32 cumulative_position = sizeof(size_type);
+        for(size_type i = 0; i < this->size(); ++i){
+            new( &this->entries_[i] ) value_type(&data_buffer[cumulative_position]);
+            cumulative_position += TWK_INDEX_META_ENTRY_SIZE;
+        }
+        assert(cumulative_position == l_data);
+    }
+
+    // Destroys the constructed entries, then releases the raw storage.
+    ~IndexMetaContainer(){
+        for(size_type i = 0; i < this->size(); ++i)
+            ((this->entries_ + i)->~IndexMetaEntry)();
+
+        ::operator delete[](static_cast<void*>(this->entries_));
+    }
+
+    // Thin pointer wrapper; only ++, *, ->, == and != are provided.
+    class iterator{
+    private:
+        typedef iterator self_type;
+        typedef std::forward_iterator_tag iterator_category;
+
+    public:
+        iterator(pointer ptr) : ptr_(ptr) { }
+        void operator++() { ptr_++; }
+        void operator++(int junk) { ptr_++; }
+        reference operator*() const{ return *ptr_; }
+        pointer operator->() const{ return ptr_; }
+        bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; }
+        bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; }
+    private:
+        pointer ptr_;
+    };
+
+    // Read-only counterpart of `iterator`; holds a pointer-to-const.
+    class const_iterator{
+    private:
+        typedef const_iterator self_type;
+        typedef std::forward_iterator_tag iterator_category;
+
+    public:
+        const_iterator(const_pointer ptr) : ptr_(ptr) { }
+        void operator++() { ptr_++; }
+        void operator++(int junk) { ptr_++; }
+        const_reference operator*() const{ return *ptr_; }
+        const_pointer operator->() const{ return ptr_; }
+        bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; }
+        bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; }
+    private:
+        const_pointer ptr_;
+    };
+
+    // Element access
+    // None of these are bounds checked, at() included.
+    inline reference at(const size_type& position){ return(this->entries_[position]); }
+    inline const_reference at(const size_type& position) const{ return(this->entries_[position]); }
+    inline reference operator[](const size_type& position){ return(this->entries_[position]); }
+    inline const_reference operator[](const size_type& position) const{ return(this->entries_[position]); }
+    inline pointer data(void){ return(this->entries_); }
+    inline const_pointer data(void) const{ return(this->entries_); }
+    inline reference front(void){ return(this->entries_[0]); }
+    inline const_reference front(void) const{ return(this->entries_[0]); }
+    inline reference back(void){ return(this->entries_[this->n_entries_ - 1]); }
+    inline const_reference back(void) const{ return(this->entries_[this->n_entries_ - 1]); }
+
+    // Capacity
+    inline const bool empty(void) const{ return(this->n_entries_ == 0); }
+    inline const size_type& size(void) const{ return(this->n_entries_); }
+    inline const size_type& capacity(void) const{ return(this->n_capacity_); }
+
+    // Iterator
+    inline iterator begin(){ return iterator(&this->entries_[0]); }
+    inline iterator end() { return iterator(&this->entries_[this->n_entries_]); }
+    inline const_iterator begin() const{ return const_iterator(&this->entries_[0]); }
+    inline const_iterator end() const{ return const_iterator(&this->entries_[this->n_entries_]); }
+    inline const_iterator cbegin() const{ return const_iterator(&this->entries_[0]); }
+    inline const_iterator cend() const{ return const_iterator(&this->entries_[this->n_entries_]); }
+
+    // Overload basic operator
+    // Appends a copy of `index_entry`, growing the storage first when it is
+    // (nearly) full.
+    self_type& operator+=(const value_type& index_entry){
+        if(this->size() + 1 >= this->capacity()){
+            // A zero capacity cannot grow by doubling (0*2 == 0); without
+            // this the placement new below would write outside the allocation.
+            this->resize(this->capacity() > 0 ? this->capacity()*2 : 2);
+        }
+
+        new( &this->entries_[this->n_entries_] ) value_type(index_entry); // invoke copy ctor
+        ++this->n_entries_;
+        return(*this);
+    }
+
+    // Changes the capacity to `new_capacity`.
+    // Smaller than the current capacity: the allocation is kept and only the
+    // entries at positions >= new_capacity are destroyed.
+    // Otherwise: a new allocation is made and every entry is copied across.
+    void resize(const size_t new_capacity){
+        if(new_capacity < this->capacity()){
+            // Only truncate when entries actually fall outside the new bound.
+            // Assigning n_entries_ unconditionally would raise the element
+            // count over memory that was never constructed.
+            if(new_capacity < this->size()){
+                for(size_type i = new_capacity; i < this->size(); ++i)
+                    ((this->entries_ + i)->~IndexMetaEntry)();
+
+                this->n_entries_ = new_capacity;
+            }
+            return;
+        }
+
+        pointer temp = this->entries_; // Move current data pointer
+        this->entries_ = static_cast<pointer>(::operator new[](new_capacity*sizeof(value_type))); // Allocate new memory at old pointer
+        // Copy data over from temporary data pointer to new pointer
+        for(size_type i = 0; i < this->size(); ++i)
+            new( &this->entries_[i] ) value_type(temp[i]);
+
+        // Release memory from the temporary address
+        for(size_type i = 0; i < this->size(); ++i)
+            ((temp + i)->~IndexMetaEntry)();
+
+        ::operator delete[](static_cast<void*>(temp));
+        this->n_capacity_ = new_capacity;
+    }
+    // Doubles the capacity (a zero capacity becomes 2).
+    inline void resize(void){ this->resize(this->capacity() > 0 ? this->capacity()*2 : 2); }
+
+private:
+    // Writes the entry count (size_type) followed by every entry.
+    friend std::ofstream& operator<<(std::ofstream& stream, const self_type& container){
+        stream.write(reinterpret_cast<const char*>(&container.n_entries_), sizeof(size_type));
+        for(size_type i = 0; i < container.size(); ++i)
+            stream << container[i];
+
+        return stream;
+    }
+
+private:
+    size_type n_entries_;
+    size_type n_capacity_;
+    pointer   entries_;
+};
+
+}
+}
+
+
+#endif /* INDEX_INDEX_META_CONTAINER_H_ */
diff --git a/src/index/index_meta_entry.h b/src/index/index_meta_entry.h
new file mode 100644
index 0000000..9dc53f0
--- /dev/null
+++ b/src/index/index_meta_entry.h
@@ -0,0 +1,97 @@
+#ifndef INDEX_INDEX_META_ENTRY_H_
+#define INDEX_INDEX_META_ENTRY_H_
+
+#include <fstream> // ostream, ofstream, istream
+
+#include "../support/type_definitions.h" // U32, U64
+
+namespace Tomahawk{
+namespace Totempole{
+
+// Encoded width of one entry: 2 x U32, 4 x U64
+#define TWK_INDEX_META_ENTRY_SIZE (sizeof(U64)*4 + sizeof(U32)*2)
+
+/**<
+ * Meta index record: a range of index entries [index_begin, index_end]
+ * together with the position range and totals they add up to.
+ */
+struct IndexMetaEntry{
+public:
+    typedef IndexMetaEntry self_type;
+
+public:
+    IndexMetaEntry() :
+        index_begin(0),
+        index_end(0),
+        min_position(0),
+        max_position(0),
+        n_variants(0),
+        uncompressed_size(0)
+    {
+    }
+
+    // Decodes one packed record. Offsets follow the write order used by the
+    // binary operator<< below:
+    //   index_begin 0 | index_end 4 | min_position 8 | max_position 16 |
+    //   n_variants 24 | uncompressed_size 32
+    IndexMetaEntry(const char* const data) :
+        index_begin(*reinterpret_cast<const U32*>(data)),
+        index_end(*reinterpret_cast<const U32*>(&data[sizeof(U32)])),
+        min_position(*reinterpret_cast<const U64*>(&data[sizeof(U32)*2])),
+        max_position(*reinterpret_cast<const U64*>(&data[sizeof(U32)*2+sizeof(U64)])),
+        n_variants(*reinterpret_cast<const U64*>(&data[sizeof(U32)*2+sizeof(U64)*2])),
+        uncompressed_size(*reinterpret_cast<const U64*>(&data[sizeof(U32)*2+sizeof(U64)*3]))
+    {
+    }
+
+    // Copy ctor
+    IndexMetaEntry(const self_type& other) :
+        index_begin(other.index_begin),
+        index_end(other.index_end),
+        min_position(other.min_position),
+        max_position(other.max_position),
+        n_variants(other.n_variants),
+        uncompressed_size(other.uncompressed_size)
+    {
+    }
+    ~IndexMetaEntry() = default;
+
+    // NOTE(review): n_variants is U64 but size() returns U32, so large
+    // counts are truncated here - confirm whether callers rely on U32.
+    inline U32 size(void) const{ return(this->n_variants); }
+    inline const bool empty(void) const{ return(this->n_variants == 0); }
+    inline void operator++(void){ ++this->n_variants; }
+
+    // Human-readable, tab-delimited form.
+    friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){
+        stream << entry.index_begin << "-" << entry.index_end << '\t' << entry.min_position << '-' << entry.max_position << '\t' << entry.n_variants << '\t' << entry.uncompressed_size;
+        return stream;
+    }
+
+    // Binary form; TWK_INDEX_META_ENTRY_SIZE bytes.
+    friend std::ofstream& operator<<(std::ofstream& stream, const self_type& entry){
+        stream.write(reinterpret_cast<const char*>(&entry.index_begin),       sizeof(U32));
+        stream.write(reinterpret_cast<const char*>(&entry.index_end),         sizeof(U32));
+        stream.write(reinterpret_cast<const char*>(&entry.min_position),      sizeof(U64));
+        stream.write(reinterpret_cast<const char*>(&entry.max_position),      sizeof(U64));
+        stream.write(reinterpret_cast<const char*>(&entry.n_variants),        sizeof(U64));
+        stream.write(reinterpret_cast<const char*>(&entry.uncompressed_size), sizeof(U64));
+        return(stream);
+    }
+
+    friend std::istream& operator>>(std::istream& stream, self_type& entry){
+        stream.read(reinterpret_cast<char*>(&entry.index_begin),       sizeof(U32));
+        stream.read(reinterpret_cast<char*>(&entry.index_end),         sizeof(U32));
+        stream.read(reinterpret_cast<char*>(&entry.min_position),      sizeof(U64));
+        stream.read(reinterpret_cast<char*>(&entry.max_position),      sizeof(U64));
+        stream.read(reinterpret_cast<char*>(&entry.n_variants),        sizeof(U64));
+        stream.read(reinterpret_cast<char*>(&entry.uncompressed_size), sizeof(U64));
+        return(stream);
+    }
+
+    void reset(void){
+        this->index_begin       = 0;
+        this->index_end         = 0;
+        this->min_position      = 0;
+        this->max_position      = 0;
+        this->n_variants        = 0;
+        this->uncompressed_size = 0;
+    }
+
+public:
+    U32 index_begin;
+    U32 index_end;
+    U64 min_position;      // smallest bp position in tomahawk block
+    U64 max_position;      // largest bp position in tomahawk block
+    U64 n_variants;        // number of variants in this block
+    U64 uncompressed_size; // uncompressed size of this block
+};
+
+}
+}
+
+
+#endif /* INDEX_INDEX_META_ENTRY_H_ */
diff --git a/src/index/tomahawk_header.cpp b/src/index/tomahawk_header.cpp
new file mode 100644
index 0000000..8b567f0
--- /dev/null
+++ b/src/index/tomahawk_header.cpp
@@ -0,0 +1,270 @@
+#include "tomahawk_header.h"
+
+namespace Tomahawk{
+
+// Starts empty: no contigs, no sample names, no lookup tables.
+TomahawkHeader::TomahawkHeader(void) :
+    contigs_(nullptr),
+    sample_names_(nullptr),
+    contigs_hash_table_(nullptr),
+    sample_hash_table_(nullptr)
+{
+
+}
+
+// Standard dtor
+TomahawkHeader::~TomahawkHeader(void){
+    delete [] this->contigs_;
+    delete [] this->sample_names_;
+    delete this->contigs_hash_table_;
+    delete this->sample_hash_table_;
+}
+
+// Open and close functions
+
+/**<
+ * Reads a header from `stream`: the magic block, then the compressed
+ * payload holding contigs, sample names and the trailing literals.
+ * @param stream Input stream positioned at the start of the header
+ * @return       1 on success; -1 bad stream, -2 magic validation failed,
+ *               -3 payload could not be inflated, -4 hash tables could
+ *               not be built
+ */
+int TomahawkHeader::open(std::istream& stream){
+    if(stream.good() == false){
+        std::cerr << Helpers::timestamp("ERROR") << "Stream is bad!" << std::endl;
+        return(-1);
+    }
+
+    stream >> this->magic_;
+    if(this->validate() == false){
+        std::cerr << Helpers::timestamp("ERROR") << "Failed to validate MAGIC header!" << std::endl;
+        return(-2);
+    }
+
+    if(stream.good() == false){
+        std::cerr << Helpers::timestamp("ERROR") << "Stream is bad!" << std::endl;
+        return(-1);
+    }
+
+    // Parse literal block
+    compressor_type tgzf_controller(this->magic_.l_header_uncompressed + 1024);
+    buffer_type buffer(this->magic_.l_header + 1024);
+    buffer_type buffer_uncompressed(this->magic_.l_header_uncompressed + 1024);
+    stream.read(buffer.data(), this->magic_.l_header);
+    buffer.n_chars = this->magic_.l_header;
+
+    // From here on every exit releases both buffers, mirroring the explicit
+    // cleanup done on the success path.
+    if(stream.good() == false){
+        std::cerr << Helpers::timestamp("ERROR") << "Stream is bad!" << std::endl;
+        buffer.deleteAll();
+        buffer_uncompressed.deleteAll();
+        return(-1);
+    }
+
+    if(!tgzf_controller.Inflate(buffer, buffer_uncompressed)){
+        std::cerr << Helpers::timestamp("ERROR", "TGZF") << "Failed to inflate literal TGZF DATA!" << std::endl;
+        buffer.deleteAll();
+        buffer_uncompressed.deleteAll();
+        return(-3);
+    }
+
+    // Parse contigs
+    // Parse names
+    // Construct hash tables
+    U32 buffer_position = 0;
+
+    // Parse contigs
+    // Release what an earlier open() may have allocated.
+    delete [] this->contigs_;
+    this->contigs_ = new contig_type[this->magic_.getNumberContigs()];
+    for(U32 i = 0; i < this->magic_.getNumberContigs(); ++i){
+        buffer_position += this->contigs_[i].interpret(&buffer_uncompressed[buffer_position]);
+        // `<=`: a record may end exactly at the end of the payload.
+        assert(buffer_position <= buffer_uncompressed.size());
+    }
+
+    // Parse sample names
+    // Encoded as |length in characters|character buffer|
+    delete [] this->sample_names_;
+    this->sample_names_ = new std::string[this->magic_.getNumberSamples()];
+    for(U32 i = 0; i < this->magic_.getNumberSamples(); ++i){
+        const U32 length = *reinterpret_cast<const U32*>(&buffer_uncompressed[buffer_position]);
+        buffer_position += sizeof(U32);
+
+        this->sample_names_[i] = std::string(&buffer_uncompressed[buffer_position], length);
+        buffer_position += length;
+        // `<=`: with no trailing literals the last name ends the payload.
+        assert(buffer_position <= buffer_uncompressed.size());
+    }
+
+    // Remainder is literal data
+    const U32 l_literals = buffer_uncompressed.size() - buffer_position;
+    this->literals_ = std::string(&buffer_uncompressed[buffer_position], l_literals);
+
+    // Buffer cleanup; everything needed has been copied out by now
+    buffer.deleteAll();
+    buffer_uncompressed.deleteAll();
+
+    // Build hash tables for contigs and sample names
+    if(this->BuildHashTables() == false){
+        std::cerr << Helpers::timestamp("ERROR") << "Cannot build hash tables" << std::endl;
+        return(-4);
+    }
+
+    return(1);
+}
+
+/**<
+ * Serialises the header to `stream`: contigs, length-prefixed sample
+ * names and literals are packed, compressed, and written after the
+ * magic block (whose two length fields are updated first).
+ * @param stream Output stream
+ * @return       1 on success; -1 bad stream, -3 compression failed
+ */
+int TomahawkHeader::write(std::ostream& stream){
+    if(stream.good() == false){
+        std::cerr << Helpers::timestamp("ERROR") << "Stream is bad!" << std::endl;
+        return(-1);
+    }
+
+    // Compute uncompressed size
+    const U32 l_uncompressed_size = this->DetermineUncompressedSize();
+
+    buffer_type buffer(l_uncompressed_size + 1024);
+    for(U32 i = 0; i < this->magic_.getNumberContigs(); ++i)
+        buffer += this->contigs_[i];
+
+    for(U32 i = 0; i < this->magic_.getNumberSamples(); ++i){
+        buffer += (U32)this->sample_names_[i].size();
+        buffer.Add(this->sample_names_[i].data(), this->sample_names_[i].size());
+    }
+
+    buffer.Add(this->literals_.data(), this->literals_.size());
+    this->magic_.l_header_uncompressed = buffer.size();
+    assert(this->magic_.l_header_uncompressed == l_uncompressed_size);
+
+    compressor_type tgzf_controller(this->magic_.l_header_uncompressed + 1024);
+    if(!tgzf_controller.Deflate(buffer)){
+        std::cerr << Helpers::timestamp("ERROR", "TGZF") << "Failed to get deflate literal TGZF DATA!" << std::endl;
+        buffer.deleteAll();
+        return(-3);
+    }
+
+    // Store compressed size
+    this->magic_.l_header = tgzf_controller.buffer.size();
+
+    stream << this->magic_;
+    if(stream.good() == false){
+        std::cerr << Helpers::timestamp("ERROR") << "Stream is bad!" << std::endl;
+        buffer.deleteAll();
+        return(-1);
+    }
+    stream.write(tgzf_controller.buffer.data(), tgzf_controller.buffer.size());
+
+    // Cleanup buffer
+    buffer.deleteAll();
+
+    return(1);
+}
+
+// Looks up a sample by name. On success `return_target` points at the
+// stored name and TRUE is returned; otherwise it is left untouched.
+const bool TomahawkHeader::getSample(const std::string& sample_name, const std::string*& return_target) const{
+    if(this->sample_hash_table_ == nullptr)
+        return false;
+
+    if(sample_name.size() == 0)
+        return false;
+
+    if(this->sample_hash_table_->occupied() == 0)
+        return false;
+
+    S32* target = nullptr;
+    if(this->sample_hash_table_->GetItem(&sample_name[0], &sample_name, target, sample_name.length())){
+        return_target = &this->sample_names_[*target];
+        return true;
+    }
+    return false;
+}
+
+// Looks up a contig by name. On success `return_target` points at the
+// stored contig name and TRUE is returned.
+const bool TomahawkHeader::getContigName(const std::string& contig_name, const std::string*& return_target) const{
+    if(this->contigs_hash_table_ == nullptr)
+        return false;
+
+    if(contig_name.size() == 0)
+        return false;
+
+    if(this->contigs_hash_table_->occupied() == 0)
+        return false;
+
+    S32* target = nullptr;
+    if(this->contigs_hash_table_->GetItem(&contig_name[0], &contig_name, target, contig_name.length())){
+        return_target = &this->contigs_[*target].name;
+        return true;
+    }
+    return false;
+}
+
+// Looks up a contig by name. On success `return_target` points at the
+// stored contig record and TRUE is returned.
+const bool TomahawkHeader::getContig(const std::string& contig_name, const contig_type*& return_target) const{
+    if(this->contigs_hash_table_ == nullptr)
+        return false;
+
+    if(contig_name.size() == 0)
+        return false;
+
+    if(this->contigs_hash_table_->occupied() == 0)
+        return false;
+
+    S32* target = nullptr;
+    if(this->contigs_hash_table_->GetItem(&contig_name[0], &contig_name, target, contig_name.length())){
+        return_target = &this->contigs_[*target];
+        return true;
+    }
+    return false;
+}
+
+// Returns the identifier stored for `contig_name`, or -1 when it cannot be
+// resolved. Every failure path yields -1: returning `false` would read as
+// contig 0, which is a valid identifier.
+const S32 TomahawkHeader::getContigID(const std::string& contig_name) const{
+    if(this->contigs_hash_table_ == nullptr)
+        return(-1);
+
+    if(contig_name.size() == 0)
+        return(-1);
+
+    if(this->contigs_hash_table_->occupied() == 0)
+        return(-1);
+
+    S32* target = nullptr;
+    if(this->contigs_hash_table_->GetItem(&contig_name[0], &contig_name, target, contig_name.length())){
+        return(*target);
+    }
+    return(-1);
+}
+
+// (Re)builds the name -> position lookup tables for contigs and samples.
+// Each table gets at least 1024 slots, otherwise twice the element count.
+// Returns FALSE when a name occurs twice.
+bool TomahawkHeader::BuildHashTables(void){
+    // For contigs
+    // Drop a table left over from an earlier call before replacing it.
+    delete this->contigs_hash_table_;
+    if(this->magic_.getNumberContigs() * 2 < 1024)
+        this->contigs_hash_table_ = new hash_table(1024);
+    else
+        this->contigs_hash_table_ = new hash_table(this->magic_.getNumberContigs() * 2);
+
+    S32* retValue = nullptr;
+    for(U32 i = 0; i < this->magic_.getNumberContigs(); ++i){
+        if(this->contigs_hash_table_->GetItem(&this->contigs_[i].name[0], &this->contigs_[i].name, retValue, this->contigs_[i].name.size())){
+            std::cerr << Helpers::timestamp("ERROR", "TOTEMPOLE") << "Duplicated contig! Impossible!" << std::endl;
+            return false;
+        }
+        this->contigs_hash_table_->SetItem(&this->contigs_[i].name[0], &this->contigs_[i].name, i, this->contigs_[i].name.size());
+    }
+
+    // For sample names
+    delete this->sample_hash_table_;
+    if(this->magic_.getNumberSamples() * 2 < 1024)
+        this->sample_hash_table_ = new hash_table(1024);
+    else
+        this->sample_hash_table_ = new hash_table(this->magic_.getNumberSamples() * 2);
+
+    retValue = nullptr;
+    for(U32 i = 0; i < this->magic_.getNumberSamples(); ++i){
+        if(this->sample_hash_table_->GetItem(&this->sample_names_[i][0], &this->sample_names_[i], retValue, this->sample_names_[i].size())){
+            std::cerr << Helpers::timestamp("ERROR", "TOTEMPOLE") << "Duplicated name! Impossible!" << std::endl;
+            return false;
+        }
+        this->sample_hash_table_->SetItem(&this->sample_names_[i][0], &this->sample_names_[i], i, this->sample_names_[i].size());
+    }
+
+    return true;
+}
+
+// Size in bytes of the payload write() assembles before compression:
+// per contig name + two U32, per sample name + one U32, plus the literals.
+const U32 TomahawkHeader::DetermineUncompressedSize(void) const{
+    U32 l_uncompressed = 0;
+    for(U32 i = 0; i < this->magic_.getNumberContigs(); ++i){
+        l_uncompressed += this->contigs_[i].name.size() + sizeof(U32)*2;
+    }
+
+    for(U32 i = 0; i < this->magic_.getNumberSamples(); ++i){
+        l_uncompressed += this->sample_names_[i].size() + sizeof(U32);
+    }
+
+    l_uncompressed += this->literals_.size();
+
+    return(l_uncompressed);
+}
+
+
+}
diff --git a/src/index/tomahawk_header.h b/src/index/tomahawk_header.h
new file mode 100644
index 0000000..2ab2f17
--- /dev/null
+++ b/src/index/tomahawk_header.h
@@ -0,0 +1,70 @@
+#ifndef INDEX_TOMAHAWK_HEADER_H_
+#define INDEX_TOMAHAWK_HEADER_H_
+
+#include
+
+#include "../algorithm/open_hashtable.h"
+#include "index_contig.h"
+#include "../io/BasicBuffer.h"
+#include "../tomahawk/tomahawk_magic_header.h"
+#include "../io/compression/TGZFController.h"
+
+namespace Tomahawk{
+
+/**<
+ * This container handles the header data for
+ * a `twk` file
+ */
+class TomahawkHeader{
+public:
+    typedef TomahawkHeader self_type;
+    typedef Totempole::HeaderContig contig_type;
+    typedef Base::TomahawkMagicHeader magic_type;
+    typedef IO::BasicBuffer buffer_type;
+    typedef Hash::HashTable hash_table;
+    typedef IO::TGZFController compressor_type;
+
+public:
+    TomahawkHeader(void);
+
+    // Standard dtor
+    ~TomahawkHeader(void);
+
+    // Open and close functions
+    int open(std::istream& stream = std::cin);
+    int write(std::ostream& stream = std::cout);
+
+    // Accessors
+    inline std::string& getLiterals(void){ return(this->literals_); }
+    inline const std::string& getLiterals(void) const{ return(this->literals_); }
+    inline std::string& getSample(const U32 position){ return(this->sample_names_[position]); }
+    inline const std::string& getSample(const U32 position) const{
return(this->sample_names_[position]); } + + const bool getSample(const std::string& sample_name, const std::string*& return_target) const; + const bool getContigName(const std::string& contig_name, const std::string*& return_target) const; + const bool getContig(const std::string& contig_name, const contig_type*& return_target) const; + const S32 getContigID(const std::string& contig_name) const; + + inline magic_type& getMagic(void){ return(this->magic_); } + inline const magic_type& getMagic(void) const{ return(this->magic_); } + + // Updater + inline void addLiteral(const std::string& string){ this->literals_ += string; } + inline const bool validate(void) const{ return(this->magic_.validate()); } + +private: + bool BuildHashTables(void); + const U32 DetermineUncompressedSize(void) const; + +public: + magic_type magic_; // magic header + std::string literals_; // literal data + contig_type* contigs_; // contig data + std::string* sample_names_; // sample names + hash_table* contigs_hash_table_; // contig name hash table + hash_table* sample_hash_table_; // sample name hash table +}; + +} + +#endif /* INDEX_TOMAHAWK_HEADER_H_ */ diff --git a/src/interface/ProgressBar.h b/src/interface/progressbar.h similarity index 99% rename from src/interface/ProgressBar.h rename to src/interface/progressbar.h index d52e066..85dde5e 100644 --- a/src/interface/ProgressBar.h +++ b/src/interface/progressbar.h @@ -9,7 +9,7 @@ #include #include "../support/helpers.h" -#include "Timer.h" +#include "timer.h" namespace Tomahawk{ namespace Interface{ diff --git a/src/interface/Timer.h b/src/interface/timer.h similarity index 100% rename from src/interface/Timer.h rename to src/interface/timer.h diff --git a/src/io/BasicBuffer.cpp b/src/io/BasicBuffer.cpp deleted file mode 100644 index d39c17c..0000000 --- a/src/io/BasicBuffer.cpp +++ /dev/null @@ -1,14 +0,0 @@ -/* - * BasicBuffer.cpp - * - * Created on: 20 Feb 2017 - * Author: mk21 - */ - -#include "BasicBuffer.h" - -namespace Tomahawk 
{ -namespace IO{ - -} -} /* namespace Tomahawk */ diff --git a/src/io/BasicBuffer.h b/src/io/BasicBuffer.h index 8281acd..516b6cb 100644 --- a/src/io/BasicBuffer.h +++ b/src/io/BasicBuffer.h @@ -3,54 +3,105 @@ #include #include -#include "../support/TypeDefinitions.h" +#include "../support/type_definitions.h" #include "../support/helpers.h" namespace Tomahawk { namespace IO{ struct BasicBuffer{ - typedef BasicBuffer self_type; - - BasicBuffer() : pointer(0), width(0), data(nullptr){} - BasicBuffer(const U64 size) : pointer(0), width(size), data(new char[size]){} - BasicBuffer(char* target, const size_t length) : pointer(length), width(length), data(target){} - BasicBuffer(const U64 size, char* target) : pointer(0), width(size), data(target){} - BasicBuffer(const self_type& other) : pointer(0), width(other.width), data(new char[other.width]){} +private: + typedef BasicBuffer self_type; + typedef char value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + +public: + BasicBuffer(void) : n_chars(0), width(0), buffer(nullptr){} + BasicBuffer(const U64 size) : n_chars(0), width(size), buffer(new value_type[size]){} + BasicBuffer(pointer target, const size_t length) : n_chars(length), width(length), buffer(target){} + BasicBuffer(const U64 size, pointer target) : n_chars(0), width(size), buffer(target){} + BasicBuffer(const self_type& other) : n_chars(0), width(other.width), buffer(new value_type[other.width]){} virtual ~BasicBuffer(){} + class iterator{ + private: + typedef iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + reference operator*() const{ return *ptr_; } + pointer operator->() const{ return ptr_; } + bool operator==(const 
self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + class const_iterator{ + private: + typedef const_iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + const_iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + const_reference operator*() const{ return *ptr_; } + const_pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + // Iterator + inline iterator begin(){ return iterator(&this->buffer[0]); } + inline iterator end() { return iterator(&this->buffer[this->n_chars - 1]); } + inline const_iterator begin() const{ return const_iterator(&this->buffer[0]); } + inline const_iterator end() const{ return const_iterator(&this->buffer[this->n_chars - 1]); } + inline const_iterator cbegin() const{ return const_iterator(&this->buffer[0]); } + inline const_iterator cend() const{ return const_iterator(&this->buffer[this->n_chars - 1]); } + inline void set(const size_t size){ - this->pointer = 0; + this->n_chars = 0; this->width = size; - if(this->data != nullptr) - delete [] this->data; + if(this->buffer != nullptr) + delete [] this->buffer; - this->data = new char[size]; + this->buffer = new char[size]; } - inline void deleteAll(void){ delete [] this->data; } // manual cleaup + inline void deleteAll(void){ delete [] this->buffer; } // manual cleaup inline void set(const size_t size, char* target){ - this->pointer = 0; + this->n_chars = 0; this->width = size; - this->data = target; + this->buffer = target; } inline virtual void set(char* target){ - this->pointer = 0; + this->n_chars = 0; this->width = 0; - this->data = target; + this->buffer = target; } - inline void reset(){ this->pointer = 0; } - inline 
void move(const U64 to){ this->pointer = to; } - inline const U64& size(void) const{ return this->pointer; } + inline void reset(){ this->n_chars = 0; } + inline void move(const U64 to){ this->n_chars = to; } + inline const U64& size(void) const{ return this->n_chars; } inline const U64& capacity(void) const{ return this->width; } void resize(const U64 new_size){ if(new_size <= this->capacity()){ if(new_size < this->size()) - this->pointer = new_size; + this->n_chars = new_size; return; } @@ -58,13 +109,13 @@ struct BasicBuffer{ U64 copy_to = this->size(); if(new_size < this->size()){ copy_to = new_size; - this->pointer = copy_to; + this->n_chars = copy_to; } - //std::cerr << Helpers::timestamp("DEBUG") << "Resizing buffer: " << this->capacity() << " -> " << new_size << "\tcopyto: " << copy_to << std::endl; - char* target = this->data; - this->data = new char[new_size]; - memcpy(&this->data[0], &target[0], copy_to); + //std::cerr << utility::timestamp("DEBUG") << "Resizing buffer: " << this->capacity() << " -> " << new_size << "\tcopyto: " << copy_to << std::endl; + char* target = this->buffer; + this->buffer = new char[new_size]; + memcpy(&this->buffer[0], &target[0], copy_to); delete [] target; this->width = new_size; } @@ -79,137 +130,189 @@ struct BasicBuffer{ if(this->size() + length >= this->capacity()) this->resize((this->size() + length) * 2); - memcpy(&this->data[this->pointer], &data[0], length); - this->pointer += length; + memcpy(&this->buffer[this->n_chars], &data[0], length); + this->n_chars += length; + } + + void AddReadble(const SBYTE& value){ + const int ret = sprintf(&this->buffer[this->n_chars], "%d", value); + this->n_chars += ret; + } + + void AddReadble(const S16& value){ + const int ret = sprintf(&this->buffer[this->n_chars], "%d", value); + this->n_chars += ret; + } + + void AddReadble(const S32& value){ + const int ret = sprintf(&this->buffer[this->n_chars], "%d", value); + this->n_chars += ret; + } + + void AddReadble(const BYTE& value){ 
+ const int ret = sprintf(&this->buffer[this->n_chars], "%u", value); + this->n_chars += ret; + } + + void AddReadble(const U16& value){ + const int ret = sprintf(&this->buffer[this->n_chars], "%u", value); + this->n_chars += ret; + } + + void AddReadble(const U32& value){ + const int ret = sprintf(&this->buffer[this->n_chars], "%u", value); + this->n_chars += ret; + } + + void AddReadble(const U64& value){ + const int ret = sprintf(&this->buffer[this->n_chars], "%llu", value); + this->n_chars += ret; + } + + void AddReadble(const float& value){ + const int ret = sprintf(&this->buffer[this->n_chars], "%g", value); + this->n_chars += ret; + } + + void AddReadble(const double& value){ + const int ret = sprintf(&this->buffer[this->n_chars], "%g", value); + this->n_chars += ret; } inline self_type& operator+=(const self_type& other){ if(this->size() + other.size() >= this->capacity()) this->resize((this->size() + other.size()) * 2); - memcpy(&this->data[this->pointer], other.data, other.pointer); - this->pointer += other.pointer; + memcpy(&this->buffer[this->n_chars], other.buffer, other.n_chars); + this->n_chars += other.n_chars; return *this; } inline self_type& operator+=(const char& value){ - if(this->pointer + sizeof(char) >= this->width) + if(this->n_chars + sizeof(char) >= this->width) this->resize(this->width*2); - this->data[this->pointer] = value; - ++this->pointer; + this->buffer[this->n_chars] = value; + ++this->n_chars; return *this; } inline self_type& operator+=(const BYTE& value){ - if(this->pointer + sizeof(BYTE) >= this->width) + if(this->n_chars + sizeof(BYTE) >= this->width) this->resize(this->width*2); - BYTE* p = reinterpret_cast(&this->data[this->pointer]); + BYTE* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; - this->pointer += sizeof(BYTE); + this->n_chars += sizeof(BYTE); return *this; } inline self_type& operator+=(const float& value){ - if(this->pointer + sizeof(float) >= this->width) + if(this->n_chars + sizeof(float) >= 
this->width) this->resize(this->width*2); - float* p = reinterpret_cast(&this->data[this->pointer]); + float* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; - this->pointer += sizeof(float); + this->n_chars += sizeof(float); return *this; } inline self_type& operator+=(const U16 value){ - if(this->pointer + sizeof(U16) >= this->width) + if(this->n_chars + sizeof(U16) >= this->width) this->resize(this->width*2); - U16* p = reinterpret_cast(&this->data[this->pointer]); + U16* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; - this->pointer += sizeof(U16); + this->n_chars += sizeof(U16); return *this; } inline self_type& operator+=(const short& value){ - if(this->pointer + sizeof(short) >= this->width) + if(this->n_chars + sizeof(short) >= this->width) this->resize(this->width*2); - short* p = reinterpret_cast(&this->data[this->pointer]); + short* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; - this->pointer += sizeof(short); + this->n_chars += sizeof(short); return *this; } inline self_type& operator+=(const U32& value){ - if(this->pointer + sizeof(U32) >= this->width) + if(this->n_chars + sizeof(U32) >= this->width) this->resize(this->width*2); - U32* p = reinterpret_cast(&this->data[this->pointer]); + U32* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; - this->pointer += sizeof(U32); + this->n_chars += sizeof(U32); return *this; } inline self_type& operator+=(const S32& value){ - if(this->pointer + sizeof(S32) >= this->width) + if(this->n_chars + sizeof(S32) >= this->width) this->resize(this->width*2); - S32* p = reinterpret_cast(&this->data[this->pointer]); + S32* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; - this->pointer += sizeof(S32); + this->n_chars += sizeof(S32); return *this; } inline self_type& operator+=(const double& value){ - if(this->pointer + sizeof(double) >= this->width) + if(this->n_chars + sizeof(double) >= this->width) this->resize(this->width*2); - double* p 
= reinterpret_cast(&this->data[this->pointer]); + double* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; - this->pointer += sizeof(double); + this->n_chars += sizeof(double); return *this; } inline self_type& operator+=(const U64& value){ - if(this->pointer + sizeof(U64) >= this->width) + if(this->n_chars + sizeof(U64) >= this->width) this->resize(this->width*2); - U64* p = reinterpret_cast(&this->data[this->pointer]); + U64* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; - this->pointer += sizeof(U64); + this->n_chars += sizeof(U64); return *this; } inline self_type& operator+=(const std::string& value){ - if(this->pointer + value.size() + sizeof(BYTE) >= this->width){ + if(this->n_chars + value.size() + sizeof(BYTE) >= this->width){ U64 resize_to = this->width * 2; - while(this->pointer + value.size() + sizeof(BYTE) >= resize_to) + while(this->n_chars + value.size() + sizeof(BYTE) >= resize_to) resize_to *= 2; this->resize(resize_to); } for(U32 i = 0; i < value.size(); ++i){ - this->data[this->pointer] = value[i]; - ++this->pointer; + this->buffer[this->n_chars] = value[i]; + ++this->n_chars; } return *this; } - char& operator[](const U64 size){ return this->data[size]; } - const char& operator[](const U64 size) const{ return this->data[size]; } + inline reference operator[](const U64 position){ return this->buffer[position]; } + inline const_reference operator[](const U64 position) const{ return this->buffer[position]; } + inline reference at(const U64 position){ return this->buffer[position]; } + inline const_reference at(const U64 position) const{ return this->buffer[position]; } + inline pointer data(void){ return(this->buffer); } + inline const_pointer data(void) const{ return(this->buffer); } + +private: friend std::ostream& operator<<(std::ostream& out, const self_type& data){ - out.write(data.data, data.pointer); + out.write(data.buffer, data.n_chars); return(out); } - U64 pointer; - U64 width; - char* data; +public: + 
U64 n_chars; + U64 width; + pointer buffer; }; } /* namespace IO */ diff --git a/src/io/BasicWriters.h b/src/io/BasicWriters.h index a3c8024..9cb735b 100644 --- a/src/io/BasicWriters.h +++ b/src/io/BasicWriters.h @@ -15,7 +15,7 @@ namespace IO{ class GenericWriterInterace { protected: - typedef IO::BasicBuffer buffer_type; + typedef IO::BasicBuffer buffer_type; typedef Algorithm::SpinLock lock_type; public: @@ -39,8 +39,8 @@ class GenericWriterInterace { virtual bool close(void) =0; inline lock_type* getLock(void){ return(&this->lock); } - virtual inline const size_t writeNoLock(const char* data, const U32 length) =0; - virtual inline const size_t writeNoLock(const buffer_type& buffer) =0; + virtual const size_t writeNoLock(const char* data, const U32 length) =0; + virtual const size_t writeNoLock(const buffer_type& buffer) =0; protected: lock_type lock; @@ -76,8 +76,8 @@ class WriterStandardOut : public GenericWriterInterace{ } inline const size_t writeNoLock(const buffer_type& buffer){ - std::cout.write(&buffer.data[0], buffer.pointer); - return(buffer.pointer); + std::cout.write(buffer.data(), buffer.size()); + return(buffer.size()); } void operator<<(void* entry){} @@ -87,7 +87,7 @@ class WriterStandardOut : public GenericWriterInterace{ // Note that this threads enter here at random // Extremely unlikely there is every any contention this->lock.lock(); - std::cout.write(&buffer.data[0], buffer.size()); + std::cout.write(buffer.data(), buffer.size()); this->lock.unlock(); } }; @@ -108,12 +108,15 @@ class WriterFile : public GenericWriterInterace{ } bool open(const std::string output){ + std::cerr << "here in open: " << output << std::endl; if(output.length() == 0){ std::cerr << Helpers::timestamp("ERROR", "WRITER") << "No output name provided..." 
<< std::endl; return false; } + std::cerr << "after test" << std::endl; this->stream.open(output, std::ios::binary | std::ios::out); + std::cerr << "after first open" << std::endl; if(!this->stream.good()){ std::cerr << Helpers::timestamp("ERROR", "WRITER") << "Could not open output file: " << output << "..." << std::endl; return false; @@ -122,6 +125,8 @@ class WriterFile : public GenericWriterInterace{ if(!SILENT) std::cerr << Helpers::timestamp("LOG", "WRITER") << "Opening output file: " << output << "..." << std::endl; + std::cerr << "returnign open" << std::endl; + std::cerr << this->stream.good() << std::endl; return true; } @@ -130,16 +135,22 @@ class WriterFile : public GenericWriterInterace{ inline void flush(void){ this->stream.flush(); } inline bool close(void){ this->stream.close(); return true; } + template + void operator<<(const Y& value){ + this->stream << value; + } + void operator<<(const buffer_type& buffer){ // Mutex lock; write; unlock // Note that this threads enter here at random // Extremely unlikely there is every any contention this->lock.lock(); - this->stream.write(&buffer.data[0], buffer.size()); + this->stream.write(buffer.data(), buffer.size()); this->lock.unlock(); } void operator<<(void* entry){} + const size_t write(const char* data, const U64& length){ this->lock.lock(); this->stream.write(&data[0], length); @@ -153,12 +164,12 @@ class WriterFile : public GenericWriterInterace{ } inline const size_t writeNoLock(const buffer_type& buffer){ - this->stream.write(&buffer.data[0], buffer.pointer); - return(buffer.pointer); + this->stream.write(buffer.data(), buffer.size()); + return(buffer.size()); } private: - std::string outFile; + std::string outFile; std::ofstream stream; }; diff --git a/src/io/PackedEntryReader.h b/src/io/PackedEntryReader.h deleted file mode 100644 index e3c7bd5..0000000 --- a/src/io/PackedEntryReader.h +++ /dev/null @@ -1,168 +0,0 @@ -#ifndef PACKEDENTRYREADER_H_ -#define PACKEDENTRYREADER_H_ - -namespace 
Tomahawk{ -namespace IO{ - -#define PACKED_READER_DEFAULT_CHUNK 1000000 // 1MB - -template -class PackedEntryReader{ - typedef TomahawkOutputEntry entry_type; - -public: - PackedEntryReader(); - virtual ~PackedEntryReader(); - - bool setup(const std::string file, size_t chunk_size = PACKED_READER_DEFAULT_CHUNK - PACKED_READER_DEFAULT_CHUNK % Y); - bool nextEntry(const entry_type*& entry); - virtual bool nextBlock(void); - - inline bool seek(const size_t pos); - inline entry_type* begin(void){ return(this->entries); } - inline entry_type* end(void){ return(&this->entries[this->entry_tail-1]); } - inline entry_type* operator[](const U32& p){ return(&this->entries[p]); } - inline const size_t& size(void) const{ return this->entry_tail; } - inline const size_t& size_buffer(void) const{ return this->buffer_size; } - inline void reset(void){ this->entry_head = 0; this->entry_tail = 0; } - inline void next(void){ ++this->entry_head; } - inline void prev(void){ --this->entry_head; } - inline bool available(void) const{ return(this->entry_head < this->entry_tail); } - inline bool good(void) const{ return(this->stream.good()); } - inline const size_t& filesize(void) const{ return this->__filesize; } - inline size_t tellg(void){ return this->stream.tellg(); } - inline const size_t& block_size(void) const{ return this->read_block_size; } - -protected: - bool open(const std::string& file); - -protected: - size_t __filesize; - size_t entry_head; - size_t entry_tail; - size_t buffer_size; - size_t read_block_size; - std::ifstream stream; - char* buffer; - entry_type* entries; -}; - -template -PackedEntryReader::PackedEntryReader() - : __filesize(0) - , entry_head(0) - , entry_tail(0) - , buffer_size(0) - , read_block_size(0) - , buffer(nullptr) - , entries(nullptr) -{} - -template -PackedEntryReader::~PackedEntryReader(){ - delete [] this->buffer; -} - -template -bool PackedEntryReader::setup(const std::string file, size_t chunk_size){ - if(!this->open(file)){ - std::cerr << 
Helpers::timestamp("ERROR", "IO") << "Failed to open file..." << std::endl; - return false; - } - - if(chunk_size == 0){ - std::cerr << Helpers::timestamp("ERROR", "IO") << "Illegal chunk size" << std::endl; - return false; - } - - if(chunk_size % Y != 0){ - std::cerr << "Adjusting chunk size: " << chunk_size << " -> "; - chunk_size -= chunk_size % Y; - std::cerr << chunk_size << std::endl; - if(chunk_size == 0){ - std::cerr << "illegal chunk size" << std::endl; - return false; - } - } - - this->read_block_size = chunk_size; - - this->reset(); - delete [] this->buffer; - this->buffer = new char[this->read_block_size]; - this->buffer_size = this->read_block_size; - - return true; -} - -template -bool PackedEntryReader::open(const std::string& file){ - this->stream.open(file, std::ios::binary | std::ios::in | std::ios::ate); - if(!this->good()){ - std::cerr << Helpers::timestamp("ERROR", "IO") << "IO-stream is bad..." << std::endl; - return false; - } - - this->__filesize = this->stream.tellg(); - this->stream.seekg(0); - - return true; -} - -template -bool PackedEntryReader::nextEntry(const entry_type*& entry){ - if(!this->available()){ - if(!this->nextBlock()) - return false; - } - - entry = &this->entries[this->entry_head]; - this->next(); - return true; -} - -template -bool PackedEntryReader::nextBlock(void){ - if(!this->good()){ - std::cerr << Helpers::timestamp("ERROR", "IO") << "IO-stream has failed..." 
<< std::endl; - return false; - } - - if(this->stream.tellg() == this->filesize()) - return false; - - // Ignore if unset - // comparison does not happen if tellg() == -1, return above - this->reset(); - - size_t readAmount = this->read_block_size; - if((U64)this->stream.tellg() + this->read_block_size > this->filesize()) - readAmount = this->filesize() - this->stream.tellg(); - - this->stream.read(this->buffer, readAmount); - const U32 entries_read = this->stream.gcount() / Y; - this->buffer_size = this->stream.gcount(); - if(this->stream.gcount() % Y != 0){ - std::cerr << Helpers::timestamp("ERROR", "IO") << "block is staggered" << std::endl; - return false; - } - - this->entry_tail = entries_read; - this->entries = reinterpret_cast(this->buffer); - - return true; -} - -template -bool PackedEntryReader::seek(const size_t pos){ - this->stream.seekg(pos); - this->reset(); // trigger reloading data when asking for next entry - return(this->good()); -} - -} -} - - - -#endif /* PACKEDENTRYREADER_H_ */ diff --git a/src/io/bcf/BCFEntry.cpp b/src/io/bcf/BCFEntry.cpp index 7d953fa..537f3c9 100644 --- a/src/io/bcf/BCFEntry.cpp +++ b/src/io/bcf/BCFEntry.cpp @@ -10,7 +10,7 @@ namespace Tomahawk { namespace BCF { BCFEntry::BCFEntry(void): - pointer(0), + l_data(0), limit(262144), l_ID(0), p_genotypes(0), @@ -30,7 +30,7 @@ BCFEntry::~BCFEntry(void){ delete [] this->data; } void BCFEntry::resize(const U32 size){ char* temp = this->data; this->data = new char[size]; - memcpy(this->data, temp, this->pointer); + memcpy(this->data, temp, this->l_data); std::swap(temp, this->data); delete [] temp; this->body = reinterpret_cast(this->data); @@ -40,11 +40,11 @@ void BCFEntry::resize(const U32 size){ } void BCFEntry::add(const char* const data, const U32 length){ - if(this->pointer + length > this-> capacity()) - this->resize(this->pointer + length + 65536); + if(this->l_data + length > this-> capacity()) + this->resize(this->l_data + length + 65536); - 
memcpy(&this->data[this->pointer], data, length); - this->pointer += length; + memcpy(&this->data[this->l_data], data, length); + this->l_data += length; } void BCFEntry::__parseID(U32& internal_pos){ @@ -126,25 +126,14 @@ bool BCFEntry::parse(void){ // Format key const base_type& fmt_type = *reinterpret_cast(&this->data[internal_pos++]); - //std::cerr << "fmt_key:" << (int)fmt_key_value << '\t' << "fmt_type: " << (int)fmt_type.high << '\t' << (int)fmt_type.low << std::endl; - //std::cerr << (int)fmt_type_value2 << '\t' << (int)fmt_type_value1 << std::endl; - //assert(fmt_type.high == 2); if(fmt_type.high != 2){ this->isGood = false; return false; } - this->isGood = true; - - /* - for(U32 i = 0; i < 44; ++i){ - const SBYTE& fmt_type_value1 = *reinterpret_cast(&this->data[internal_pos++]); - const SBYTE& fmt_type_value2 = *reinterpret_cast(&this->data[internal_pos++]); - std::cerr << i << ':' << " " << (int)fmt_type_value1 << ',' << (int)fmt_type_value2 << '\t' << (int)(BCF::BCF_UNPACK_GENOTYPE(fmt_type_value1)) << ',' << (int)(BCF::BCF_UNPACK_GENOTYPE(fmt_type_value2)) << std::endl; - } - */ - this->genotypes = &this->data[internal_pos]; + this->isGood = true; + this->genotypes = &this->data[internal_pos]; this->p_genotypes = internal_pos; return true; diff --git a/src/io/bcf/BCFEntry.h b/src/io/bcf/BCFEntry.h index 996f37f..9add53f 100644 --- a/src/io/bcf/BCFEntry.h +++ b/src/io/bcf/BCFEntry.h @@ -10,28 +10,24 @@ const BYTE BCF_UNPACK_TOMAHAWK[3] = {2, 0, 1}; #define BCF_UNPACK_GENOTYPE(A) BCF_UNPACK_TOMAHAWK[(A >> 1)] -#pragma pack(1) -struct BCFAtomicBase{ +#pragma pack(push, 1) +struct __attribute__((packed, aligned(1))) BCFAtomicBase{ BYTE low: 4, high: 4; }; -#pragma pack(1) -struct BCFAtomicSBYTE{ +struct __attribute__((packed, aligned(1))) BCFAtomicSBYTE{ SBYTE low: 4, high: 4; }; -#pragma pack(1) -struct BCFAtomicS16{ +struct __attribute__((packed, aligned(1))) BCFAtomicS16{ S16 low: 4, high: 12; }; -#pragma pack(1) -struct BCFAtomicS32{ +struct 
__attribute__((packed, aligned(1))) BCFAtomicS32{ S32 low: 4, high: 28; }; -#pragma pack(1) -struct BCFEntryBody{ +struct __attribute__((packed, aligned(1))) BCFEntryBody{ typedef BCFEntryBody self_type; BCFEntryBody(); // disallow ctor and dtor @@ -62,6 +58,8 @@ struct BCFEntryBody{ U32 n_sample: 8, n_fmt: 24; }; +#pragma pack(pop) + struct BCFTypeString{ typedef BCFAtomicBase base_type; @@ -70,23 +68,34 @@ struct BCFTypeString{ }; struct BCFEntry{ +public: + typedef BCFEntry self_type; typedef IO::BasicBuffer buffer_type; - typedef BCFEntryBody body_type; - typedef BCFTypeString string_type; - typedef BCFAtomicBase base_type; + typedef BCFEntryBody body_type; + typedef BCFTypeString string_type; + typedef BCFAtomicBase base_type; +public: BCFEntry(void); ~BCFEntry(void); void resize(const U32 size); void add(const char* const data, const U32 length); - inline void reset(void){ this->pointer = 0; this->isGood = false; } - inline const U32& size(void) const{ return(this->pointer); } + inline void reset(void){ this->l_data = 0; this->isGood = false; } + inline const U32& size(void) const{ return(this->l_data); } inline const U32& capacity(void) const{ return(this->limit); } inline U64 sizeBody(void) const{ return(this->body->l_shared + this->body->l_indiv); } + /**< + * Support function: + * Checks that there is exactly two alleles and that both the + * ref and alt allele are of length one (i.e. 
is a simple SNV->SNV) + * @return Returns TRUE if fulfilling these critera or FALSE otherwise + */ inline const bool isSimple(void) const{ - return((this->body->n_allele == 2) && (this->alleles[0].length == 1 && this->alleles[1].length == 1)); + return((this->body->n_allele == 2) && + (this->alleles[0].length == 1 && + this->alleles[1].length == 1)); } void __parseID(U32& internal_pos); @@ -98,12 +107,12 @@ struct BCFEntry{ const bool& good(void) const{ return(this->isGood); } public: - U32 pointer; // byte width - U32 limit; // capacity - U32 l_ID; - U32 p_genotypes; // position genotype data begin - BYTE ref_alt; // parsed - bool isGood; + U32 l_data; // byte width + U32 limit; // capacity + U32 l_ID; + U32 p_genotypes; // position genotype data begin + BYTE ref_alt; // parsed + bool isGood; char* data; // hard copy data to buffer, interpret internally body_type* body; // BCF2 body string_type* alleles; // pointer to pointer of ref alleles and their lengths diff --git a/src/io/bcf/BCFReader.cpp b/src/io/bcf/BCFReader.cpp index ec9a978..984effe 100644 --- a/src/io/bcf/BCFReader.cpp +++ b/src/io/bcf/BCFReader.cpp @@ -85,7 +85,7 @@ bool BCFReader::parseHeader(void){ return false; } - if(strncmp(&this->bgzf_controller.buffer.data[0], "BCF\2\2", 5) != 0){ + if(strncmp(this->bgzf_controller.buffer.data(), "BCF\2\2", 5) != 0){ std::cerr << Tomahawk::Helpers::timestamp("ERROR","BCF") << "Failed to validate MAGIC" << std::endl; return false; } diff --git a/src/io/bcf/BCFReader.h b/src/io/bcf/BCFReader.h index 32ff238..481bce0 100644 --- a/src/io/bcf/BCFReader.h +++ b/src/io/bcf/BCFReader.h @@ -8,18 +8,19 @@ #include "../BasicBuffer.h" #include "../compression/BGZFController.h" #include "BCFEntry.h" +#include "../vcf/VCFHeader.h" namespace Tomahawk { namespace BCF { class BCFReader{ - typedef BCFReader self_type; - typedef IO::BasicBuffer buffer_type; - typedef IO::BGZFController bgzf_controller_type; - typedef IO::BGZFHeader bgzf_type; - typedef VCF::VCFHeader 
header_type; + typedef BCFReader self_type; + typedef IO::BasicBuffer buffer_type; + typedef IO::BGZFController bgzf_controller_type; + typedef IO::BGZFHeader bgzf_type; + typedef VCF::VCFHeader header_type; typedef VCF::VCFHeaderContig contig_type; - typedef BCFEntry entry_type; + typedef BCFEntry entry_type; public: BCFReader(); @@ -33,13 +34,13 @@ class BCFReader{ bool open(const std::string input); public: - std::ifstream stream; - U64 filesize; - U32 current_pointer; - buffer_type buffer; - buffer_type header_buffer; + std::ifstream stream; + U64 filesize; + U32 current_pointer; + buffer_type buffer; + buffer_type header_buffer; bgzf_controller_type bgzf_controller; - header_type header; + header_type header; }; } diff --git a/src/io/compression/BGZFController.cpp b/src/io/compression/BGZFController.cpp index e1d0b0c..b33a3bf 100644 --- a/src/io/compression/BGZFController.cpp +++ b/src/io/compression/BGZFController.cpp @@ -19,7 +19,7 @@ BGZFController::~BGZFController(){ this->buffer.deleteAll(); } void BGZFController::Clear(){ this->buffer.reset(); } U32 BGZFController::InflateSize(buffer_type& input) const{ - const header_type& header = *reinterpret_cast(&input.data[0]); + const header_type& header = *reinterpret_cast(input.data()); if(!header.Validate()){ std::cerr << Helpers::timestamp("ERROR","BGZF") << "Invalid BGZF header" << std::endl; std::cerr << Helpers::timestamp("DEBUG","BGZF") << "Output length: " << header.BSIZE << std::endl; @@ -49,7 +49,7 @@ bool BGZFController::Inflate(buffer_type& input, buffer_type& output, const head } bool BGZFController::__Inflate(buffer_type& input, buffer_type& output, const header_type& header) const{ - const U32& uncompressedLength = *reinterpret_cast(&input.data[input.size() - sizeof(U32)]); + const U32& uncompressedLength = *reinterpret_cast(&input[input.size() - sizeof(U32)]); if(output.size() + uncompressedLength >= output.capacity()) output.resize((output.size() + uncompressedLength) + 65536); @@ -63,9 +63,9 @@ 
bool BGZFController::__Inflate(buffer_type& input, buffer_type& output, const he z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; - zs.next_in = (Bytef*)&input.data[Constants::BGZF_BLOCK_HEADER_LENGTH]; + zs.next_in = (Bytef*)&input[Constants::BGZF_BLOCK_HEADER_LENGTH]; zs.avail_in = (header.BSIZE + 1) - 16; - zs.next_out = (Bytef*)&output.data[output.pointer]; + zs.next_out = (Bytef*)&output[output.size()]; zs.avail_out = (U32)avail_out; int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS); @@ -94,16 +94,16 @@ bool BGZFController::__Inflate(buffer_type& input, buffer_type& output, const he //if(zs.total_out == 0) // std::cerr << Helpers::timestamp("LOG", "BGZF") << "Detected empty BGZF block" << std::endl; - output.pointer += zs.total_out; + output.n_chars += zs.total_out; return(true); } bool BGZFController::InflateBlock(std::ifstream& stream, buffer_type& input){ input.resize(sizeof(header_type)); - stream.read(&input.data[0], IO::Constants::BGZF_BLOCK_HEADER_LENGTH); - const header_type* h = reinterpret_cast(&input.data[0]); - input.pointer = IO::Constants::BGZF_BLOCK_HEADER_LENGTH; + stream.read(input.data(), IO::Constants::BGZF_BLOCK_HEADER_LENGTH); + const header_type* h = reinterpret_cast(input.data()); + input.n_chars = IO::Constants::BGZF_BLOCK_HEADER_LENGTH; if(!h->Validate()){ std::cerr << Tomahawk::Helpers::timestamp("ERROR", "BCF") << "Failed to validate!" 
<< std::endl; std::cerr << *h << std::endl; @@ -114,16 +114,16 @@ bool BGZFController::InflateBlock(std::ifstream& stream, buffer_type& input){ // Recast because if buffer is resized then the pointer address is incorrect // resulting in segfault - h = reinterpret_cast(&input.data[0]); + h = reinterpret_cast(input.data()); - stream.read(&input.data[IO::Constants::BGZF_BLOCK_HEADER_LENGTH], (h->BSIZE + 1) - IO::Constants::BGZF_BLOCK_HEADER_LENGTH); + stream.read(&input[IO::Constants::BGZF_BLOCK_HEADER_LENGTH], (h->BSIZE + 1) - IO::Constants::BGZF_BLOCK_HEADER_LENGTH); if(!stream.good()){ std::cerr << Tomahawk::Helpers::timestamp("ERROR", "BCF") << "Truncated file..." << std::endl; return false; } - input.pointer = h->BSIZE + 1; - const U32 uncompressed_size = *reinterpret_cast(&input[input.pointer - sizeof(U32)]); + input.n_chars = h->BSIZE + 1; + const U32 uncompressed_size = *reinterpret_cast(&input[input.size() - sizeof(U32)]); this->buffer.resize(uncompressed_size + 1); this->buffer.reset(); diff --git a/src/io/compression/BGZFController.h b/src/io/compression/BGZFController.h index ff7a368..5082ec4 100644 --- a/src/io/compression/BGZFController.h +++ b/src/io/compression/BGZFController.h @@ -27,7 +27,7 @@ class BGZFController { bool InflateBlock(std::ifstream& stream, buffer_type& input); friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ - stream.write(entry.buffer.data, entry.buffer.pointer); + stream.write(entry.buffer.data(), entry.buffer.size()); return stream; } diff --git a/src/io/compression/GZFConstants.h b/src/io/compression/GZFConstants.h index 9395868..1cb4c62 100644 --- a/src/io/compression/GZFConstants.h +++ b/src/io/compression/GZFConstants.h @@ -1,7 +1,7 @@ #ifndef GZFCONSTANTS_H_ #define GZFCONSTANTS_H_ -#include "../../support/TypeDefinitions.h" +#include "../../support/type_definitions.h" namespace Tomahawk{ namespace IO{ @@ -22,11 +22,11 @@ const BYTE BGZF_ID1 = 66; const BYTE BGZF_ID2 = 67; const BYTE BGZF_LEN = 
2; -const SBYTE GZIP_WINDOW_BITS = -15; -const SBYTE Z_DEFAULT_MEM_LEVEL = 8; -const BYTE TGZF_BLOCK_HEADER_LENGTH = 20; -const BYTE TGZF_BLOCK_FOOTER_LENGTH = 8; -const BYTE BGZF_BLOCK_HEADER_LENGTH = 18; +const SBYTE GZIP_WINDOW_BITS = -15; +const SBYTE Z_DEFAULT_MEM_LEVEL = 8; +const BYTE TGZF_BLOCK_HEADER_LENGTH = 20; +const BYTE TGZF_BLOCK_FOOTER_LENGTH = 8; +const BYTE BGZF_BLOCK_HEADER_LENGTH = 18; } } diff --git a/src/io/compression/GZFHeader.h b/src/io/compression/GZFHeader.h index 12de6b6..f4b4d61 100644 --- a/src/io/compression/GZFHeader.h +++ b/src/io/compression/GZFHeader.h @@ -6,8 +6,8 @@ namespace Tomahawk{ namespace IO{ -#pragma pack(1) -struct __headerBase{ +#pragma pack(push, 1) +struct __attribute__((packed, aligned(1))) __headerBase{ private: typedef __headerBase self_type; @@ -72,8 +72,7 @@ struct __headerBase{ block to 2^32 bytes and adds and an extra "BC" field in the gzip header which records the size. */ -#pragma pack(1) -struct TGZFHeader : public __headerBase{ +struct __attribute__((packed, aligned(1))) TGZFHeader : public __headerBase{ private: typedef TGZFHeader self_type; typedef __headerBase parent_type; @@ -121,8 +120,7 @@ struct TGZFHeader : public __headerBase{ block to 2^16 bytes and adds and an extra "BC" field in the gzip header which records the size. 
*/ -#pragma pack(1) -struct BGZFHeader : public __headerBase{ +struct __attribute__((packed, aligned(1))) BGZFHeader : public __headerBase{ private: typedef BGZFHeader self_type; typedef __headerBase parent_type; @@ -157,6 +155,8 @@ struct BGZFHeader : public __headerBase{ } }; +#pragma pack(pop) + } } diff --git a/src/io/compression/TGZFController.cpp b/src/io/compression/TGZFController.cpp index 5351578..6e37d47 100644 --- a/src/io/compression/TGZFController.cpp +++ b/src/io/compression/TGZFController.cpp @@ -38,7 +38,7 @@ bool TGZFController::Inflate(buffer_type& input, buffer_type& output, const head } bool TGZFController::__Inflate(buffer_type& input, buffer_type& output, const header_type& header) const{ - const U32& uncompressedLength = *reinterpret_cast(&input.data[input.size() - sizeof(U32)]); + const U32& uncompressedLength = *reinterpret_cast(&input[input.size() - sizeof(U32)]); if(output.size() + uncompressedLength >= output.capacity()) output.resize((output.size() + uncompressedLength) + 65536); @@ -53,9 +53,9 @@ bool TGZFController::__Inflate(buffer_type& input, buffer_type& output, const he z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; - zs.next_in = (Bytef*)&input.data[Constants::TGZF_BLOCK_HEADER_LENGTH]; + zs.next_in = (Bytef*)&input[Constants::TGZF_BLOCK_HEADER_LENGTH]; zs.avail_in = (header.BSIZE + 1) - 16; - zs.next_out = (Bytef*)&output.data[output.pointer]; + zs.next_out = (Bytef*)&output[output.size()]; zs.avail_out = (U32)avail_out; int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS); @@ -84,7 +84,7 @@ bool TGZFController::__Inflate(buffer_type& input, buffer_type& output, const he if(zs.total_out == 0) std::cerr << Helpers::timestamp("LOG", "TGZF") << "Detected empty TGZF block" << std::endl; - output.pointer += zs.total_out; + output.n_chars += zs.total_out; return(true); } @@ -92,7 +92,7 @@ bool TGZFController::__Inflate(buffer_type& input, buffer_type& output, const he bool TGZFController::Deflate(const buffer_type& buffer){ 
this->buffer.resize(buffer); - memset(this->buffer.data, 0, Constants::TGZF_BLOCK_HEADER_LENGTH); + memset(this->buffer.data(), 0, Constants::TGZF_BLOCK_HEADER_LENGTH); this->buffer[0] = Constants::GZIP_ID1; this->buffer[1] = Constants::GZIP_ID2; @@ -113,8 +113,8 @@ bool TGZFController::Deflate(const buffer_type& buffer){ z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; - zs.next_in = (Bytef*)buffer.data; - zs.avail_in = buffer.pointer; + zs.next_in = (Bytef*)buffer.data(); + zs.avail_in = buffer.size(); zs.next_out = (Bytef*)&this->buffer[Constants::TGZF_BLOCK_HEADER_LENGTH]; zs.avail_out = this->buffer.width - Constants::TGZF_BLOCK_HEADER_LENGTH - @@ -169,18 +169,18 @@ bool TGZFController::Deflate(const buffer_type& buffer){ //std::cerr << Helpers::timestamp("DEBUG") << "Time: " << *time << std::endl; - memset(&buffer.data[compressedLength - Constants::TGZF_BLOCK_FOOTER_LENGTH], 0, Constants::TGZF_BLOCK_FOOTER_LENGTH); + memset(&buffer.buffer[compressedLength - Constants::TGZF_BLOCK_FOOTER_LENGTH], 0, Constants::TGZF_BLOCK_FOOTER_LENGTH); // store the CRC32 checksum U32 crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)buffer.data, buffer.pointer); + crc = crc32(crc, (Bytef*)buffer.data(), buffer.size()); U32* c = reinterpret_cast(&this->buffer[compressedLength - Constants::TGZF_BLOCK_FOOTER_LENGTH]); *c = crc; - U32 convert = buffer.pointer; // avoid potential problems when casting from U64 to U32 by interpretation + U32 convert = buffer.size(); // avoid potential problems when casting from U64 to U32 by interpretation U32* uncompressed = reinterpret_cast(&this->buffer[compressedLength - sizeof(U32)]); *uncompressed = convert; // Store uncompressed length - this->buffer.pointer = compressedLength; + this->buffer.n_chars = compressedLength; //std::cerr << "Writing: " << convert << '/' << *uncompressed << '\t' << compressedLength << '\t' << *test << '\t' << buffer.size() << '\t' << "At pos: " << (compressedLength - sizeof(U32)) << '\t' << buffer.pointer << '\t' 
<< *c << '\t' << convert << std::endl; return true; @@ -191,11 +191,11 @@ bool TGZFController::Deflate(buffer_type& meta, buffer_type& rle){ return(this->Deflate(meta)); } -bool TGZFController::InflateBlock(std::ifstream& stream, buffer_type& input){ +bool TGZFController::InflateBlock(std::istream& stream, buffer_type& input){ input.resize(sizeof(header_type)); - stream.read(&input.data[0], IO::Constants::TGZF_BLOCK_HEADER_LENGTH); - const header_type* h = reinterpret_cast(&input.data[0]); - input.pointer = IO::Constants::TGZF_BLOCK_HEADER_LENGTH; + stream.read(input.data(), IO::Constants::TGZF_BLOCK_HEADER_LENGTH); + const header_type* h = reinterpret_cast(input.data()); + input.n_chars = IO::Constants::TGZF_BLOCK_HEADER_LENGTH; if(!h->Validate()){ std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TGZF") << "Failed to validate!" << std::endl; std::cerr << *h << std::endl; @@ -206,16 +206,16 @@ bool TGZFController::InflateBlock(std::ifstream& stream, buffer_type& input){ // Recast because if buffer is resized then the pointer address is incorrect // resulting in segfault - h = reinterpret_cast(&input.data[0]); + h = reinterpret_cast(input.data()); - stream.read(&input.data[IO::Constants::TGZF_BLOCK_HEADER_LENGTH], h->BSIZE - IO::Constants::TGZF_BLOCK_HEADER_LENGTH); + stream.read(&input[IO::Constants::TGZF_BLOCK_HEADER_LENGTH], h->BSIZE - IO::Constants::TGZF_BLOCK_HEADER_LENGTH); if(!stream.good()){ std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TGZF") << "Truncated file..." 
<< std::endl; return false; } - input.pointer = h->BSIZE; - const U32 uncompressed_size = *reinterpret_cast(&input[input.pointer - sizeof(U32)]); + input.n_chars = h->BSIZE; + const U32 uncompressed_size = *reinterpret_cast(&input[input.size() - sizeof(U32)]); this->buffer.resize(uncompressed_size); this->buffer.reset(); diff --git a/src/io/compression/TGZFController.h b/src/io/compression/TGZFController.h index bc9dd03..1b31f14 100644 --- a/src/io/compression/TGZFController.h +++ b/src/io/compression/TGZFController.h @@ -29,13 +29,13 @@ class TGZFController{ void Clear(); bool Inflate(buffer_type& input, buffer_type& output, const header_type& header) const; bool Inflate(buffer_type& input, buffer_type& output) const; - bool InflateBlock(std::ifstream& stream, buffer_type& input); + bool InflateBlock(std::istream& stream, buffer_type& input); bool Deflate(const buffer_type& buffer); bool Deflate(buffer_type& meta, buffer_type& rle); friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ - stream.write(entry.buffer.data, entry.buffer.pointer); + stream.write(entry.buffer.data(), entry.buffer.size()); return stream; } diff --git a/src/io/compression/TGZFControllerStream.cpp b/src/io/compression/TGZFControllerStream.cpp index 4c04aeb..817f96c 100644 --- a/src/io/compression/TGZFControllerStream.cpp +++ b/src/io/compression/TGZFControllerStream.cpp @@ -13,8 +13,8 @@ bool TGZFControllerStream::InflateOpen(std::ifstream& stream){ this->buffer.reset(); this->buffer.resize(this->chunk_size); this->bytes_read = 0; - stream.read(&this->buffer.data[0], IO::Constants::TGZF_BLOCK_HEADER_LENGTH); - const header_type* h = reinterpret_cast(&this->buffer.data[0]); + stream.read(this->buffer.data(), IO::Constants::TGZF_BLOCK_HEADER_LENGTH); + const header_type* h = reinterpret_cast(this->buffer.data()); if(!h->Validate()){ std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TGZF") << "Failed to validate!" 
<< std::endl; @@ -80,19 +80,19 @@ bool TGZFControllerStream::__Inflate(std::ifstream& stream, const BYTE* output, if(this->bytes_read + this->chunk_size > this->BSIZE) read_amount = this->BSIZE - this->bytes_read; - stream.read(&this->buffer.data[0], read_amount); + stream.read(this->buffer.data(), read_amount); size_t total = stream.gcount(); this->bytes_read += total; //std::cerr << "READ: " << total << "\t" << stream.tellg() << std::endl; this->d_stream.avail_in = total; - this->d_stream.next_in = (Bytef*)&this->buffer.data[0]; + this->d_stream.next_in = (Bytef*)this->buffer.data(); if(total == 0){ std::cerr << Helpers::timestamp("WARNING","TGZF") << "Nothing read!" << std::endl; return false; } - this->buffer.pointer = total; + this->buffer.n_chars = total; } const U32 tot_out = this->d_stream.total_out; diff --git a/src/io/compression/TGZFEntryIterator.h b/src/io/compression/TGZFEntryIterator.h index cc04380..0a217cf 100644 --- a/src/io/compression/TGZFEntryIterator.h +++ b/src/io/compression/TGZFEntryIterator.h @@ -74,7 +74,7 @@ bool TGZFEntryIterator::nextEntry(const T*& entry){ } U32 ret_size = 0; - if(!parent_type::Inflate(this->stream, (BYTE*)&output_buffer.data[0], this->chunk_size, ret_size)){ + if(!parent_type::Inflate(this->stream, (BYTE*)output_buffer.data(), this->chunk_size, ret_size)){ if(this->STATE != TGZF_STATE::TGZF_END){ std::cerr << Helpers::timestamp("ERROR","TGZF") << "Invalid state (" << this->STATE << ")" << std::endl; exit(1); @@ -97,7 +97,7 @@ bool TGZFEntryIterator::nextEntry(const T*& entry){ this->reset(); // reset state } - if(!parent_type::Inflate(this->stream, (BYTE*)&output_buffer.data[0], this->chunk_size, ret_size)){ + if(!parent_type::Inflate(this->stream, (BYTE*)output_buffer.data(), this->chunk_size, ret_size)){ if(this->STATE != TGZF_STATE::TGZF_END){ std::cerr << Helpers::timestamp("ERROR","TGZF") << "Invalid state (" << this->STATE << ")" << std::endl; exit(1); @@ -111,10 +111,10 @@ bool 
TGZFEntryIterator::nextEntry(const T*& entry){ } - this->output_buffer.pointer = ret_size; - this->n_entries = ret_size / sizeof(T); - this->pointer = 0; - this->entries = reinterpret_cast(this->output_buffer.data); + this->output_buffer.n_chars = ret_size; + this->n_entries = ret_size / sizeof(T); + this->pointer = 0; + this->entries = reinterpret_cast(this->output_buffer.data()); } entry = &this->entries[this->pointer++]; diff --git a/src/io/output_writer.cpp b/src/io/output_writer.cpp new file mode 100644 index 0000000..ebd2643 --- /dev/null +++ b/src/io/output_writer.cpp @@ -0,0 +1,187 @@ +#include "output_writer.h" + +namespace Tomahawk{ +namespace IO{ + +OutputWriter::OutputWriter(void) : + owns_pointers(true), + writing_sorted_(false), + writing_sorted_partial_(false), + n_entries(0), + n_progress_count(0), + n_blocks(0), + l_flush_limit(2000000), + l_largest_uncompressed(0), + stream(nullptr), + buffer(this->l_flush_limit*2), + spin_lock(new spin_lock_type), + index_(new index_type), + footer_(new footer_type) +{ + +} + +OutputWriter::OutputWriter(std::string input_file) : + owns_pointers(true), + writing_sorted_(false), + writing_sorted_partial_(false), + n_entries(0), + n_progress_count(0), + n_blocks(0), + l_flush_limit(2000000), + l_largest_uncompressed(0), + stream(new std::ofstream(input_file, std::ios::binary | std::ios::out)), + buffer(this->l_flush_limit*2), + spin_lock(new spin_lock_type), + index_(new index_type), + footer_(new footer_type) +{ + +} + +OutputWriter::OutputWriter(const self_type& other) : + owns_pointers(false), + writing_sorted_(other.writing_sorted_), + writing_sorted_partial_(other.writing_sorted_partial_), + n_entries(other.n_entries), + n_progress_count(other.n_progress_count), + n_blocks(other.n_blocks), + l_flush_limit(other.l_flush_limit), + l_largest_uncompressed(0), + stream(other.stream), + buffer(other.buffer.capacity()), + spin_lock(other.spin_lock), + index_(other.index_), + footer_(other.footer_) +{ + +} + 
+OutputWriter::~OutputWriter(void){ + if(this->owns_pointers){ + this->stream->flush(); + this->stream->close(); + delete this->stream; + delete this->spin_lock; + delete this->index_; + delete this->footer_; + } +} + +bool OutputWriter::open(const std::string& output_file){ + if(output_file.size() == 0) + return false; + + this->CheckOutputNames(output_file); + this->filename = output_file; + + this->stream = new std::ofstream(this->basePath + this->baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX, std::ios::binary | std::ios::out); + if(this->stream->good() == false){ + std::cerr << "Failed to open: " << output_file << std::endl; + return false; + } + + return true; +} + +int OutputWriter::writeHeaders(twk_header_type& twk_header){ + const std::string command = "##tomahawk_calcCommand=" + Helpers::program_string(); + twk_header.getLiterals() += command; + // Set file type to TWO + twk_header.magic_.file_type = 1; + + return(twk_header.write(*this->stream)); +} + +void OutputWriter::writeFinal(void){ + this->footer_->l_largest_uncompressed = this->l_largest_uncompressed; + this->footer_->offset_end_of_data = this->stream->tellp(); + this->index_->setSorted(this->isSorted()); + this->index_->setPartialSorted(this->isPartialSorted()); + + this->stream->flush(); + *this->stream << *this->index_; + *this->stream << *this->footer_; + this->stream->flush(); +} + +void OutputWriter::flush(void){ + if(this->buffer.size() > 0){ + if(!this->compressor.Deflate(this->buffer)){ + std::cerr << Helpers::timestamp("ERROR","TGZF") << "Failed deflate DATA..." 
<< std::endl; + exit(1); + } + + if(this->buffer.size() > l_largest_uncompressed) + this->l_largest_uncompressed = this->buffer.size(); + + this->spin_lock->lock(); + this->index_entry.byte_offset = (U64)this->stream->tellp(); + this->index_entry.uncompressed_size = this->buffer.size(); + this->stream->write(this->compressor.buffer.data(), this->compressor.buffer.size()); + this->index_entry.byte_offset_end = (U64)this->stream->tellp(); + this->index_entry.n_variants = this->buffer.size() / sizeof(entry_type); + //*this->stream << this->index_entry; + this->index_->getContainer() += this->index_entry; + //std::cerr << this->index_entry.byte_offset_from << "->" << this->index_entry.byte_offset_to << " for " << this->index_entry.n_entries << " of " << this->index_entry.uncompressed_size << std::endl; + ++this->n_blocks; + + this->spin_lock->unlock(); + + this->buffer.reset(); + this->compressor.Clear(); + this->index_entry.reset(); + } +} + +void OutputWriter::operator<<(const container_type& container){ + for(size_type i = 0; i < container.size(); ++i) + this->buffer << container[i]; + + this->n_entries += buffer.size() / sizeof(entry_type); + *this << this->buffer; +} + +void OutputWriter::operator<<(buffer_type& buffer){ + if(buffer.size() > 0){ + if(!this->compressor.Deflate(buffer)){ + std::cerr << Helpers::timestamp("ERROR","TGZF") << "Failed deflate DATA..." 
<< std::endl; + exit(1); + } + + if(buffer.size() > l_largest_uncompressed) + this->l_largest_uncompressed = buffer.size(); + + // Lock + this->spin_lock->lock(); + + this->index_entry.byte_offset = (U64)this->stream->tellp(); + this->index_entry.uncompressed_size = buffer.size(); + this->stream->write(this->compressor.buffer.data(), this->compressor.buffer.size()); + this->index_entry.byte_offset_end = (U64)this->stream->tellp(); + this->index_entry.n_variants = buffer.size() / sizeof(entry_type); + this->index_->getContainer() += this->index_entry; + ++this->n_blocks; + + // Unlock + this->spin_lock->unlock(); + + buffer.reset(); + this->compressor.Clear(); + this->index_entry.reset(); + } +} + +void OutputWriter::CheckOutputNames(const std::string& input){ + std::vector paths = Helpers::filePathBaseExtension(input); + this->basePath = paths[0]; + if(this->basePath.size() > 0) + this->basePath += '/'; + + if(paths[3].size() == Tomahawk::Constants::OUTPUT_LD_SUFFIX.size() && strncasecmp(&paths[3][0], &Tomahawk::Constants::OUTPUT_LD_SUFFIX[0], Tomahawk::Constants::OUTPUT_LD_SUFFIX.size()) == 0) + this->baseName = paths[2]; + else this->baseName = paths[1]; +} + +} +} diff --git a/src/io/output_writer.h b/src/io/output_writer.h new file mode 100644 index 0000000..d9c049c --- /dev/null +++ b/src/io/output_writer.h @@ -0,0 +1,179 @@ +#ifndef IO_OUTPUT_WRITER_H_ +#define IO_OUTPUT_WRITER_H_ + +#include "../support/MagicConstants.h" +#include "../support/simd_definitions.h" +#include "../support/helpers.h" +#include "../io/compression/TGZFController.h" +#include "../algorithm/spinlock.h" +#include "../index/tomahawk_header.h" +#include "../index/index_entry.h" +#include "../tomahawk/output_container.h" +#include "../tomahawk/two/output_entry_support.h" +#include "../tomahawk/meta_entry.h" +#include "../index/index_entry.h" +#include "../index/index_container.h" +#include "../index/index.h" +#include "../index/footer.h" + +namespace Tomahawk{ +namespace IO{ + +/**< + * 
Writer class for `two` entries. This class supports parallel writing + * with the use of a lock-free spin-lock (requires C++11 because of the use + * of atomic values). In parallel computing, each slave constructs their own + * OutputWriter by invoking the copy-ctor and borrowing pointers from the + * main instance. + */ +class OutputWriter{ +private: + typedef OutputWriter self_type; + typedef TGZFController compression_type; + typedef Algorithm::SpinLock spin_lock_type; + typedef BasicBuffer buffer_type; + typedef TomahawkHeader twk_header_type; + typedef Totempole::IndexEntry index_entry_type; + typedef OutputEntry entry_type; + typedef Support::OutputEntrySupport entry_support_type; + typedef Totempole::IndexEntry header_entry_type; + typedef Totempole::IndexContainer index_container_type; + typedef Index index_type; + typedef Totempole::Footer footer_type; + typedef size_t size_type; + typedef OutputContainer container_type; + +public: + OutputWriter(void); + OutputWriter(std::string input_file); + OutputWriter(const self_type& other); + ~OutputWriter(void); + + inline const U64& sizeEntries(void) const{ return(this->n_entries); } + inline const U32& sizeBlocks(void) const{ return(this->n_blocks); } + + // Setters + inline void setSorted(const bool yes){ this->writing_sorted_ = yes; } + inline void setPartialSorted(const bool yes){ this->writing_sorted_partial_ = yes; } + inline void setFlushLimit(const U32 limit){ this->l_flush_limit = limit; } + + // Getters + inline const bool isSorted(void) const{ return(this->writing_sorted_); } + inline const bool isPartialSorted(void) const{ return(this->writing_sorted_partial_); } + + bool open(const std::string& output_file); + int writeHeaders(twk_header_type& twk_header); + void writeFinal(void); + void flush(void); + + inline void ResetProgress(void){ this->n_progress_count = 0; } + inline const U32& getProgressCounts(void) const{ return this->n_progress_count; } + + inline self_type& operator+=(const self_type& 
other){ + this->n_entries += other.n_entries; + this->n_blocks += other.n_blocks; + if(other.l_largest_uncompressed > this->l_largest_uncompressed) + this->l_largest_uncompressed = other.l_largest_uncompressed; + + return(*this); + } + + inline self_type& operator=(const self_type& other){ + this->n_blocks = other.n_blocks; + this->n_entries = other.n_entries; + if(other.l_largest_uncompressed > this->l_largest_uncompressed) + this->l_largest_uncompressed = other.l_largest_uncompressed; + return(*this); + } + + /**< + * Primary function writing `two` entries to disk after being computed by a + * slave. + * @param meta_a Meta information for the from container + * @param meta_b Meta information for the to container + * @param header_a Tomahawk index entry for the from container + * @param header_b Tomahawk index entry for the to container + * @param helper Helper structure used in computing LD. Holds the allele/genotype counts and statistics + */ + template + void Add(const MetaEntry& meta_a, const MetaEntry& meta_b, const header_entry_type& header_a, const header_entry_type& header_b, const entry_support_type& helper); + + /**< + * Overloaded operator for adding a single `two` entry + * @param entry Input `two` entry + */ + inline void operator<<(const entry_type& entry){ + this->buffer << entry; + ++this->n_entries; + + // Check if the buffer has to be flushed after adding this entry + if(this->buffer.size() > this->l_flush_limit) + this->flush(); + } + + /**< + * Overloaded operator for adding an entire container of `two` entries + * @param container Target container of entries + */ + void operator<<(const container_type& container); + + /**< + * Overloaded operator for adding an entire buffer of `two` entries + * @param buffer Target buffer of entries + */ + void operator<<(buffer_type& buffer); + +private: + void CheckOutputNames(const std::string& input); + +private: + std::string filename; + std::string basePath; + std::string baseName; + bool owns_pointers; 
+ bool writing_sorted_; + bool writing_sorted_partial_; + U64 n_entries; // number of entries written + U32 n_progress_count; // lines added since last flush + U32 n_blocks; // number of index blocks writtenflush_limit + U32 l_flush_limit; + U32 l_largest_uncompressed; + index_entry_type index_entry; // keep track of sort order + std::ofstream* stream; + buffer_type buffer; + compression_type compressor; + spin_lock_type* spin_lock; + index_type* index_; + footer_type* footer_; +}; + +template +void OutputWriter::Add(const MetaEntry& meta_a, const MetaEntry& meta_b, const header_entry_type& header_a, const header_entry_type& header_b, const entry_support_type& helper){ + const U32 writePosA = meta_a.position << 2 | meta_a.phased << 1 | meta_a.missing; + const U32 writePosB = meta_b.position << 2 | meta_b.phased << 1 | meta_b.missing; + this->buffer += helper.controller; + this->buffer += header_a.contigID; + this->buffer += writePosA; + this->buffer += header_b.contigID; + this->buffer += writePosB; + this->buffer << helper; + // Add reverse + this->buffer += helper.controller; + this->buffer += header_b.contigID; + this->buffer += writePosB; + this->buffer += header_a.contigID; + this->buffer += writePosA; + this->buffer << helper; + + this->n_entries += 2; + this->n_progress_count += 2; + this->index_entry.n_variants += 2; + + if(this->buffer.size() > this->l_flush_limit) + this->flush(); +} + +} +} + +#endif /* IO_OUTPUT_WRITER_H_ */ diff --git a/src/io/reader.cpp b/src/io/reader.cpp index 8050213..9ec2ca3 100644 --- a/src/io/reader.cpp +++ b/src/io/reader.cpp @@ -3,9 +3,31 @@ namespace Tomahawk{ -reader::reader() : filesize_(0), block_size_(65536), capacity_(this->block_size_*2), end_(0), buffer_(new type[this->capacity_]){} -reader::reader(std::string input) : filename_(input), filesize_(0), block_size_(65536), capacity_(this->block_size_*2), end_(0), buffer_(new type[this->capacity_]){} -reader::reader(std::string input, const size_t block_size) : 
filename_(input), filesize_(0), block_size_(block_size), capacity_(this->block_size_*2), end_(0), buffer_(new type[this->capacity_]){} +reader::reader() : + filesize_(0), + block_size_(65536), + capacity_(this->block_size_*2), + end_(0), + buffer_(new type[this->capacity_]) +{} + +reader::reader(std::string input) : + filename_(input), + filesize_(0), + block_size_(65536), + capacity_(this->block_size_*2), + end_(0), + buffer_(new type[this->capacity_]) +{} + +reader::reader(std::string input, const size_t block_size) : + filename_(input), + filesize_(0), + block_size_(block_size), + capacity_(this->block_size_*2), + end_(0), + buffer_(new type[this->capacity_]) +{} bool reader::open(std::string filename){ // If filename is empty diff --git a/src/io/reader.h b/src/io/reader.h index f54225c..8979921 100644 --- a/src/io/reader.h +++ b/src/io/reader.h @@ -1,3 +1,6 @@ +#ifndef BASIC_READER_H_ +#define BASIC_READER_H_ + #include #include #include @@ -7,9 +10,6 @@ #include "../support/MagicConstants.h" // for SILENT -#ifndef READER_H_ -#define READER_H_ - namespace Tomahawk{ /* @@ -122,4 +122,4 @@ class reader { } -#endif /* READER_H_ */ +#endif /* BASIC_READER_H_ */ diff --git a/src/io/vcf/VCFHeader.h b/src/io/vcf/VCFHeader.h index 62e6f11..63f57dd 100644 --- a/src/io/vcf/VCFHeader.h +++ b/src/io/vcf/VCFHeader.h @@ -8,8 +8,9 @@ #include "VCFHeaderContig.h" #include "VCFHeaderLine.h" #include "../reader.h" -#include "../../algorithm/OpenHashTable.h" -#include "../../totempole/TotempoleReader.h" +#include "../../algorithm/open_hashtable.h" +#include "../BasicBuffer.h" +#include "../compression/TGZFController.h" namespace Tomahawk { namespace VCF{ @@ -18,7 +19,6 @@ class VCFHeader { typedef VCFHeader self_type; typedef Tomahawk::Hash::HashTable hash_table; typedef VCFHeaderContig contig_type; - typedef Totempole::TotempoleReader totempole_type; typedef IO::TGZFController tgzf_type; typedef IO::BasicBuffer buffer_type; @@ -27,19 +27,6 @@ class VCFHeader { public: 
VCFHeader(); ~VCFHeader(); - void operator=(const totempole_type& other){ - this->samples = other.getHeader().samples; - this->version = other.getHeader().version; - this->contigsHashTable = other.getContigHTablePointer(); - this->sampleHashTable = other.getSampleHTablePointer(); - - this->contigs = std::vector(other.n_contigs); - for(U32 i = 0; i < other.n_contigs; ++i){ - this->contigs[i].name = other.contigs[i].name; - this->contigs[i].length = other.contigs[i].bases; - this->contigs[i].tomahawkBlocks = other.contigs[i].blocksEnd-other.contigs[i].blocksStart; - } - } void unsetBorrowedPointers(void){ this->contigsHashTable = nullptr; @@ -76,7 +63,7 @@ class VCFHeader { temp += this->literal_lines[i]; } tgzf_controller.Deflate(temp); - stream.write(&temp.data[0], temp.size()); + stream.write(temp.data(), temp.size()); tgzf_controller.Clear(); temp.deleteAll(); diff --git a/src/io/vcf/VCFHeaderConstants.h b/src/io/vcf/VCFHeaderConstants.h index 2ecd7b4..1547f03 100644 --- a/src/io/vcf/VCFHeaderConstants.h +++ b/src/io/vcf/VCFHeaderConstants.h @@ -3,7 +3,7 @@ #define VCFHEADERCONSTANTS_H_ #include -#include "../../support/TypeDefinitions.h" +#include "../../support/type_definitions.h" namespace Tomahawk{ namespace VCF{ diff --git a/src/io/vcf/VCFLines.h b/src/io/vcf/VCFLines.h index 90cbff3..01a75af 100644 --- a/src/io/vcf/VCFLines.h +++ b/src/io/vcf/VCFLines.h @@ -15,8 +15,8 @@ namespace Tomahawk{ namespace VCF{ // -#pragma pack(1) -struct VCFDiploidGenotype{ +#pragma pack(push, 1) +struct __attribute__((packed, aligned(1))) VCFDiploidGenotype{ public: VCFDiploidGenotype(); // Has no ctor or dtor ~VCFDiploidGenotype(); @@ -33,6 +33,7 @@ struct VCFDiploidGenotype{ return stream; } }; +#pragma pack(pop) class VCFLineDataInterface{ public: diff --git a/src/main.cpp b/src/main.cpp index a665af2..d53eb9e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -28,7 +28,6 @@ DEALINGS IN THE SOFTWARE. 
#include "import.h" #include "view.h" #include "sort.h" -#include "index.h" #include "concat.h" #include "stats.h" @@ -61,11 +60,6 @@ int main(int argc, char** argv){ } else if(strncmp(&argv[1][0], "sort", 4) == 0){ return(sort(argc, argv)); - } else if(strncmp(&argv[1][0], "index", 5) == 0){ - //return(index(argc, argv)); - std::cerr << "Not implemented" << std::endl; - return(1); - } else if(strncmp(&argv[1][0], "concat", 6) == 0){ return(concat(argc, argv)); diff --git a/src/math/FisherMath.cpp b/src/math/FisherMath.cpp deleted file mode 100644 index 465fffb..0000000 --- a/src/math/FisherMath.cpp +++ /dev/null @@ -1,9 +0,0 @@ -#include "FisherMath.h" - -namespace Tomahawk { -namespace Algorithm{ - - - -} -} /* namespace Tomahawk */ diff --git a/src/math/FisherMath.h b/src/math/FisherMath.h deleted file mode 100644 index d01b624..0000000 --- a/src/math/FisherMath.h +++ /dev/null @@ -1,192 +0,0 @@ -#ifndef FISHERTEST_H_ -#define FISHERTEST_H_ - -#include -#include -#include -#include -#include "../support/TypeDefinitions.h" - -namespace Tomahawk { -namespace Algorithm { - -#define KF_GAMMA_EPS 1e-14 -#define KF_TINY 1e-290 -#define FISHER_TINY 1e-279 -#define STIRLING_CONSTANT 0.5 * log(2 * M_PI) - -class FisherMath{ - -public: - FisherMath(const U32 number) : - number(number+1), - logN_values(number, 0) - { - this->Build(); - } - - ~FisherMath(){} - - void Build(void){ - double factorial = 0; - this->logN_values[0] = 0; - for(U32 i = 1; i < this->number; ++i){ - factorial += log(i); - this->logN_values[i] = factorial; - } - } - - /* Log gamma function - * \log{\Gamma(z)} - * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 - */ - double kf_lgamma(double z) const{ - double x = 0; - x += 0.1659470187408462e-06 / (z+7); - x += 0.9934937113930748e-05 / (z+6); - x -= 0.1385710331296526 / (z+5); - x += 12.50734324009056 / (z+4); - x -= 176.6150291498386 / (z+3); - x += 771.3234287757674 / (z+2); - x -= 1259.139216722289 / (z+1); - x += 676.5203681218835 / z; - 
x += 0.9999999999995183; - return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5); - } - - // regularized lower incomplete gamma function, by series expansion - double _kf_gammap(double s, double z) const{ - double sum, x; - int k; - for (k = 1, sum = x = 1.; k < 100; ++k) { - sum += (x *= z / (s + k)); - if (x / sum < KF_GAMMA_EPS) break; - } - return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum)); - } - - // regularized upper incomplete gamma function, by continued fraction - double _kf_gammaq(double s, double z) const{ - int j; - double C, D, f; - f = 1. + z - s; C = f; D = 0.; - // Modified Lentz's algorithm for computing continued fraction - // See Numerical Recipes in C, 2nd edition, section 5.2 - for (j = 1; j < 100; ++j) { - double a = j * (s - j), b = (j<<1) + 1 + z - s, d; - D = b + a * D; - if (D < KF_TINY) D = KF_TINY; - C = b + a / C; - if (C < KF_TINY) C = KF_TINY; - D = 1. / D; - d = C * D; - f *= d; - if (fabs(d - 1.) < KF_GAMMA_EPS) break; - } - return exp(s * log(z) - z - kf_lgamma(s) - log(f)); - } - - inline double kf_gammap(double s, double z) const{return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z);} - inline double kf_gammaq(double s, double z) const{return z <= 1. || z < s? 1. 
- _kf_gammap(s, z) : _kf_gammaq(s, z);} - - - __attribute__((always_inline)) - inline double StirlingsApproximation(const double v) const{ - return(STIRLING_CONSTANT + (v - 0.5) * log(v) - v + 1/(12*v) + 1/(360 * v * v * v)); - } - - __attribute__((always_inline)) - inline double logN(const S32& value) const{ - //if(value < this->number) - // return this->logN_values[value]; - //else - //return this->StirlingsApproximation((double)value + 1); - return(StirlingsApproximation((double)value+1)); - } - - __attribute__((always_inline)) - inline double fisherTest(const S32& a, const S32& b, const S32& c, const S32 d) const{ - // Rewrite Fisher's 2x2 test in log form - // return e^x - return(exp(logN(a+b) + logN(c+d) + logN(a+c) + logN(b+d) - logN(a) - logN(b) - logN(c) - logN(d) - logN(a + b + c + d))); - } - - - double fisherTestLess(S32 a, S32 b, S32 c, S32 d) const{ - S32 minValue = a; - if(d < minValue) minValue = d; - - if(minValue > 50) - return(this->chisqr(1, this->chiSquaredTest(a,b,c,d))); - - double sum = 0; - for(S32 i = 0; i <= minValue; ++i){ - sum += this->fisherTest(a, b, c, d); - if(sum < FISHER_TINY) break; - --a, ++b, ++c, --d; - } - - return(sum); - } - - double fisherTestGreater(S32 a, S32 b, S32 c, S32 d) const{ - S32 minValue = b; - if(c < minValue) minValue = c; - - if(minValue > 50) - return(this->chisqr(1, this->chiSquaredTest(a,b,c,d))); - - double sum = 0; - for(S32 i = 0; i <= minValue; ++i){ - sum += this->fisherTest(a, b, c, d); - if(sum < FISHER_TINY) break; - ++a, --b, --c, ++d; - } - - return(sum); - } - - double chisqr(const S32& Dof, const double& Cv) const{ - if(Cv < 0 || Dof < 1) - return 0.0; - - const double K = ((double)Dof) * 0.5; - const double X = Cv * 0.5; - if(Dof == 2) - return exp(-1.0 * X); - - return(this->kf_gammaq(K, X)); - } - - template - double chiSquaredTest(T& a, T& b, T& c, T& d) const{ - const T rowSums[2] = {a+b, c+d}; - const T colSums[2] = {a+c, b+d}; - const double total = a + b + c + d; - double adjustValue 
= 0; - if(a > 0.5 && b > 0.5 && c > 0.5 && d > 0.5) - adjustValue = 0.5; - - const double chisq = ((pow(a - (double)rowSums[0]*colSums[0]/total,2)-adjustValue)/(rowSums[0]*colSums[0]/total) + - (pow(b - (double)rowSums[0]*colSums[1]/total,2)-adjustValue)/(rowSums[0]*colSums[1]/total) + - (pow(c - (double)rowSums[1]*colSums[0]/total,2)-adjustValue)/(rowSums[1]*colSums[0]/total) + - (pow(d - (double)rowSums[1]*colSums[1]/total,2)-adjustValue)/(rowSums[1]*colSums[1]/total) ); - - if(chisq < 0){ - return ((pow(a - (double)rowSums[0]*colSums[0]/total,2))/(rowSums[0]*colSums[0]/total) + - (pow(b - (double)rowSums[0]*colSums[1]/total,2))/(rowSums[0]*colSums[1]/total) + - (pow(c - (double)rowSums[1]*colSums[0]/total,2))/(rowSums[1]*colSums[0]/total) + - (pow(d - (double)rowSums[1]*colSums[1]/total,2))/(rowSums[1]*colSums[1]/total) ); - } - return chisq; - } - -private: - U32 number; - std::vector logN_values; -}; - -} -} /* namespace Tomahawk */ - -#endif /* FISHERTEST_H_ */ diff --git a/src/math/fisher_math.cpp b/src/math/fisher_math.cpp new file mode 100644 index 0000000..17fc7ae --- /dev/null +++ b/src/math/fisher_math.cpp @@ -0,0 +1,115 @@ +#include "fisher_math.h" + +namespace Tomahawk { +namespace Algorithm{ + +FisherMath::FisherMath(const U32 number) : + number(number+1), + logN_values(number, 0) +{ + this->Build(); +} + +FisherMath::~FisherMath(){} + +void FisherMath::Build(void){ + double factorial = 0; + this->logN_values[0] = 0; + for(U32 i = 1; i < this->number; ++i){ + factorial += log(i); + this->logN_values[i] = factorial; + } +} + +double FisherMath::kf_lgamma(double z) const{ + double x = 0; + x += 0.1659470187408462e-06 / (z+7); + x += 0.9934937113930748e-05 / (z+6); + x -= 0.1385710331296526 / (z+5); + x += 12.50734324009056 / (z+4); + x -= 176.6150291498386 / (z+3); + x += 771.3234287757674 / (z+2); + x -= 1259.139216722289 / (z+1); + x += 676.5203681218835 / z; + x += 0.9999999999995183; + return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5); 
+} + +double FisherMath::_kf_gammap(double s, double z) const{ + double sum, x; + int k; + for (k = 1, sum = x = 1.; k < 100; ++k) { + sum += (x *= z / (s + k)); + if (x / sum < KF_GAMMA_EPS) break; + } + return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum)); +} + +double FisherMath::_kf_gammaq(double s, double z) const{ + int j; + double C, D, f; + f = 1. + z - s; C = f; D = 0.; + // Modified Lentz's algorithm for computing continued fraction + // See Numerical Recipes in C, 2nd edition, section 5.2 + for (j = 1; j < 100; ++j) { + double a = j * (s - j), b = (j<<1) + 1 + z - s, d; + D = b + a * D; + if (D < KF_TINY) D = KF_TINY; + C = b + a / C; + if (C < KF_TINY) C = KF_TINY; + D = 1. / D; + d = C * D; + f *= d; + if (fabs(d - 1.) < KF_GAMMA_EPS) break; + } + return exp(s * log(z) - z - kf_lgamma(s) - log(f)); +} + +double FisherMath::fisherTestLess(S32 a, S32 b, S32 c, S32 d) const{ + S32 minValue = a; + if(d < minValue) minValue = d; + + if(minValue > 50) + return(this->chisqr(1, this->chiSquaredTest(a,b,c,d))); + + double sum = 0; + for(S32 i = 0; i <= minValue; ++i){ + sum += this->fisherTest(a, b, c, d); + if(sum < FISHER_TINY) break; + --a, ++b, ++c, --d; + } + + return(sum); +} + +double FisherMath::fisherTestGreater(S32 a, S32 b, S32 c, S32 d) const{ + S32 minValue = b; + if(c < minValue) minValue = c; + + if(minValue > 50) + return(this->chisqr(1, this->chiSquaredTest(a,b,c,d))); + + double sum = 0; + for(S32 i = 0; i <= minValue; ++i){ + sum += this->fisherTest(a, b, c, d); + if(sum < FISHER_TINY) break; + ++a, --b, --c, ++d; + } + + return(sum); +} + +double FisherMath::chisqr(const S32& Dof, const double& Cv) const{ + if(Cv < 0 || Dof < 1) + return 0.0; + + const double K = ((double)Dof) * 0.5; + const double X = Cv * 0.5; + if(Dof == 2) + return exp(-1.0 * X); + + return(this->kf_gammaq(K, X)); +} + +} +} /* namespace Tomahawk */ diff --git a/src/math/fisher_math.h b/src/math/fisher_math.h new file mode 100644 index 0000000..242d46e --- /dev/null 
+++ b/src/math/fisher_math.h @@ -0,0 +1,91 @@ +#ifndef FISHERTEST_H_ +#define FISHERTEST_H_ + +#include +#include +#include +#include +#include "../support/type_definitions.h" + +namespace Tomahawk { +namespace Algorithm { + +#define KF_GAMMA_EPS 1e-14 +#define KF_TINY 1e-290 +#define FISHER_TINY 1e-279 +#define STIRLING_CONSTANT 0.5 * log(2 * M_PI) + +class FisherMath{ + +public: + FisherMath(const U32 number); + ~FisherMath(); + void Build(void); + double kf_lgamma(double z) const; + double _kf_gammap(double s, double z) const; + double _kf_gammaq(double s, double z) const; + + inline double kf_gammap(double s, double z) const{return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z);} + inline double kf_gammaq(double s, double z) const{return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z);} + + + __attribute__((always_inline)) + inline double StirlingsApproximation(const double v) const{ + return(STIRLING_CONSTANT + (v - 0.5) * log(v) - v + 1/(12*v) + 1/(360 * v * v * v)); + } + + __attribute__((always_inline)) + inline double logN(const S32& value) const{ + return(StirlingsApproximation((double)value+1)); + } + + __attribute__((always_inline)) + inline double fisherTest(const S32& a, const S32& b, const S32& c, const S32 d) const{ + // Rewrite Fisher's 2x2 test in log form + // return e^x + return(exp(logN(a+b) + logN(c+d) + logN(a+c) + logN(b+d) - logN(a) - logN(b) - logN(c) - logN(d) - logN(a + b + c + d))); + } + + + double fisherTestLess(S32 a, S32 b, S32 c, S32 d) const; + double fisherTestGreater(S32 a, S32 b, S32 c, S32 d) const; + double chisqr(const S32& Dof, const double& Cv) const; + + template + double chiSquaredTest(T& a, T& b, T& c, T& d) const; + +private: + U32 number; + std::vector logN_values; +}; + + + + +template +double FisherMath::chiSquaredTest(T& a, T& b, T& c, T& d) const{ + const T rowSums[2] = {a+b, c+d}; + const T colSums[2] = {a+c, b+d}; + const double total = a + b + c + d; + double adjustValue = 0; + if(a > 0.5 
&& b > 0.5 && c > 0.5 && d > 0.5) + adjustValue = 0.5; + + const double chisq = ((pow(a - (double)rowSums[0]*colSums[0]/total,2)-adjustValue)/(rowSums[0]*colSums[0]/total) + + (pow(b - (double)rowSums[0]*colSums[1]/total,2)-adjustValue)/(rowSums[0]*colSums[1]/total) + + (pow(c - (double)rowSums[1]*colSums[0]/total,2)-adjustValue)/(rowSums[1]*colSums[0]/total) + + (pow(d - (double)rowSums[1]*colSums[1]/total,2)-adjustValue)/(rowSums[1]*colSums[1]/total) ); + + if(chisq < 0){ + return ((pow(a - (double)rowSums[0]*colSums[0]/total,2))/(rowSums[0]*colSums[0]/total) + + (pow(b - (double)rowSums[0]*colSums[1]/total,2))/(rowSums[0]*colSums[1]/total) + + (pow(c - (double)rowSums[1]*colSums[0]/total,2))/(rowSums[1]*colSums[0]/total) + + (pow(d - (double)rowSums[1]*colSums[1]/total,2))/(rowSums[1]*colSums[1]/total) ); + } + return chisq; +} + +} +} /* namespace Tomahawk */ + +#endif /* FISHERTEST_H_ */ diff --git a/src/sort.h b/src/sort.h index 3304a01..c8a2d78 100644 --- a/src/sort.h +++ b/src/sort.h @@ -20,11 +20,10 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "algorithm/sort/output_sorter.h" +#include "tomahawk/two/TomahawkOutputReader.h" #include "utility.h" #include "tomahawk/TomahawkReader.h" -#include "totempole/TotempoleReader.h" -#include "tomahawk/TomahawkOutput/TomahawkOutputReader.h" -#include "algorithm/sort/TomahawkOutputSort.h" void sort_usage(void){ programMessage(); @@ -32,8 +31,7 @@ void sort_usage(void){ "About: Sort TWO files: provides two basic subroutines. If the file is too big to\n" " be sorted in available memory, use the -L option to split the file into\n" " sorted chunks no larger than -L MB in size. Then rerun sort with the -M option\n" - " to perform a k-way merge sort using the partially block-sorted data. 
Use -d to\n" - " expand A->B to A->B and B->A for accelerated queries.\n" + " to perform a k-way merge sort using the partially block-sorted data.\n" " Note that combining -L and -t incur O(L*t) memory!\n" "Usage: " << Tomahawk::Constants::PROGRAM_NAME << " sort [options] \n\n" "Options:\n" @@ -42,8 +40,7 @@ void sort_usage(void){ " -L FLOAT memory limit in MB (default: 100)\n" " -t INT threads (default: " + std::to_string(std::thread::hardware_concurrency()) + ")\n" " -M merge [null]\n" - " -D expand data (requires O(2n) memory). Is required for indexing [null]\n" - " -d do NOT expand data (see -D, default)[null]\n" + " -b INT block size in MB when merging (default: 10)\n" " -s Hide all program messages [null]\n"; } @@ -58,9 +55,8 @@ int sort(int argc, char** argv){ {"output", required_argument, 0, 'o' }, {"memory", optional_argument, 0, 'L' }, {"threads", optional_argument, 0, 't' }, - {"expand", no_argument, 0, 'D' }, - {"no-expand", no_argument, 0, 'd' }, {"merge", no_argument, 0, 'M' }, + {"block-size", optional_argument, 0, 'b' }, {"silent", no_argument, 0, 's' }, {0,0,0,0} }; @@ -68,13 +64,13 @@ int sort(int argc, char** argv){ // Parameter defaults std::string input, output; double memory_limit = 100e6; + int block_size = 10e6; bool merge = false; - bool expand = false; int threads = std::thread::hardware_concurrency(); int c = 0; int long_index = 0; - while ((c = getopt_long(argc, argv, "i:o:L:t:dDMs", long_options, &long_index)) != -1){ + while ((c = getopt_long(argc, argv, "i:o:L:t:b:dDMs", long_options, &long_index)) != -1){ switch (c){ case ':': /* missing option argument */ fprintf(stderr, "%s: option `-%c' requires an argument\n", @@ -95,11 +91,18 @@ int sort(int argc, char** argv){ break; case 'L': memory_limit = atof(optarg) * 1e6; - if(memory_limit < 0){ + if(memory_limit <= 0){ std::cerr << Tomahawk::Helpers::timestamp("ERROR") << "Parameter L cannot be negative" << std::endl; return(1); } break; + case 'b': + block_size = atoi(optarg) * 1e6; + 
if(block_size <= 0){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR") << "Parameter b cannot be negative" << std::endl; + return(1); + } + break; case 't': threads = atoi(optarg); if(threads <= 0){ @@ -111,13 +114,6 @@ int sort(int argc, char** argv){ case 'M': merge = true; break; - - case 'D': - expand = true; - break; - case 'd': - expand = false; - break; } } @@ -138,9 +134,8 @@ int sort(int argc, char** argv){ std::cerr << Tomahawk::Helpers::timestamp("LOG") << "Calling sort..." << std::endl; } - Tomahawk::Algorithm::Output::TomahawkOutputSorter reader; + Tomahawk::Algorithm::OutputSorter reader; reader.n_threads = threads; - reader.reverse_entries = expand; if(!merge){ if(!reader.sort(input, output, memory_limit)){ @@ -148,7 +143,7 @@ int sort(int argc, char** argv){ return 1; } } else { - if(!reader.sortMerge(input, output, 10e6)){ + if(!reader.sortMerge(input, output, block_size)){ std::cerr << Tomahawk::Helpers::timestamp("ERROR", "SORT") << "Failed merge" << std::endl; return 1; } diff --git a/src/stats.h b/src/stats.h index 5d99c13..eb34914 100644 --- a/src/stats.h +++ b/src/stats.h @@ -22,11 +22,10 @@ DEALINGS IN THE SOFTWARE. 
*/ #include +#include "tomahawk/two/output_filter.h" +#include "tomahawk/two/TomahawkOutputReader.h" #include "utility.h" -#include "totempole/TotempoleReader.h" #include "tomahawk/TomahawkReader.h" -#include "tomahawk/TomahawkOutput/TomahawkOutputFilterController.h" -#include "tomahawk/TomahawkOutput/TomahawkOutputReader.h" void stats_usage(void){ programMessage(); @@ -108,7 +107,7 @@ int stats(int argc, char** argv){ return(1); } else if(end == Tomahawk::Constants::OUTPUT_LD_SUFFIX){ - Tomahawk::IO::TomahawkOutputReader reader; + Tomahawk::TomahawkOutputReader reader; //reader.setWriteHeader(outputHeader); //Tomahawk::TomahawkOutputFilterController& filter = reader.getFilter(); //filter = Tomahawk::TomahawkOutputFilterController(two_filter); // use copy ctor to transfer data @@ -116,16 +115,16 @@ int stats(int argc, char** argv){ //if(!reader.setWriterType(outputType)) // return 1; - if(!reader.Open(input)) - return 1; + //if(!reader.Open(input)) + // return 1; //if(!reader.AddRegions(filter_regions)){ // std::cerr << Tomahawk::Helpers::timestamp("ERROR") << "Failed to add region!" 
<< std::endl; // return 1; //} - if(!reader.summary(input, bins)) - return 1; + //if(!reader.summary(input, bins)) + // return 1; } else { std::cerr << Tomahawk::Helpers::timestamp("ERROR") << "Unrecognised input file format: " << input << std::endl; diff --git a/src/support/MagicConstants.h b/src/support/MagicConstants.h index ff6a73b..41abc4b 100644 --- a/src/support/MagicConstants.h +++ b/src/support/MagicConstants.h @@ -2,7 +2,8 @@ #define MAGICCONSTANTS_H_ #include -#include "../support/TypeDefinitions.h" + +#include "type_definitions.h" extern int SILENT; @@ -13,27 +14,18 @@ extern std::string LITERAL_COMMAND_LINE; extern std::string INTERPRETED_COMMAND; // Versioning -const float PROGRAM_VERSION = 0.2; // major -const U32 PROGRAM_VERSION_MINOR = 0; +const float PROGRAM_VERSION_MAJOR = 0.3; // major +const float PROGRAM_VERSION_MINOR = 0; const double ALLOWED_ROUNDING_ERROR = 0.001; const std::string PROGRAM_NAME = "tomahawk"; const std::string OUTPUT_SUFFIX = "twk"; -const std::string OUTPUT_INDEX_SUFFIX = "twi"; const std::string OUTPUT_LD_SUFFIX = "two"; -const std::string OUTPUT_LD_PARTIAL_SORT_INDEX_SUFFIX = "twsi"; -const std::string OUTPUT_LD_SORT_INDEX_SUFFIX = "toi"; // Headers const char* const WRITE_HEADER_MAGIC = "TOMAHAWK\1"; -const char* const WRITE_HEADER_INDEX_MAGIC = "TOTEMPOLE\1"; -const char* const WRITE_HEADER_LD_MAGIC = "TOMAHAWK~OUTPUT\1"; -const char* const WRITE_HEADER_LD_SORT_MAGIC = "TOMAHAWK~OUTPUT~INDEX\1"; const U16 WRITE_HEADER_MAGIC_LENGTH = 9; -const U16 WRITE_HEADER_MAGIC_INDEX_LENGTH = 10; -const U16 WRITE_HEADER_LD_MAGIC_LENGTH = 16; -const U16 WRITE_HEADER_LD_SORT_MAGIC_LENGTH = 22; const BYTE TOMAHAWK_ALLELE_PACK_WIDTH = 2; // bit / allele const BYTE TOMAHAWK_SNP_PACK_WIDTH = TOMAHAWK_ALLELE_PACK_WIDTH * 2; // bits / genotype @@ -52,7 +44,7 @@ const BYTE REF_ALT_N = 4; // Upper bounds // change to constants -const U32 UPPER_LIMIT_SAMPLES_8B = ((1 << (8 - TOMAHAWK_SNP_PACK_WIDTH)) - 1); // 00001111 = 2^4 - 1 +const U32 
UPPER_LIMIT_SAMPLES_8B = ((1 << (8 - TOMAHAWK_SNP_PACK_WIDTH)) - 1); // 00001111 = 2^4 - 1 const U32 UPPER_LIMIT_SAMPLES_16B = ((1 << (16 - TOMAHAWK_SNP_PACK_WIDTH)) - 1); // 0000(1)12 = 2^12 - 1 const U32 UPPER_LIMIT_SAMPLES_32B = ((1 << (32 - TOMAHAWK_SNP_PACK_WIDTH)) - 1); // 0000(1)28 = 2^28 - 1 const U64 UPPER_LIMIT_SAMPLES_64B = (((U64)1 << (64 - TOMAHAWK_SNP_PACK_WIDTH)) - 1); // 0000(1)60 = 2^60 - 1 @@ -62,13 +54,9 @@ const U16 SAMPLES_16B_MASK = 4095; const U32 SAMPLES_32B_MASK = 268435455; const U64 SAMPLES_64B_MASK = 1152921504606846976; -// EOF -//const char* const TOMAHAWK_EOF_MARKER = "We will be known forever by the tracks we leave" - Santee Sioux Native Americans from Dakota; -//const U32 TOMAHAWK_EOF_MARKER_LENGTH = 31; - -const BYTE eof_length = 6; -const U64 eof[6] = {2336361506924422487, 7959953386435011938, 8243124871055238688, 2334386829831791136, 8583987794834190964, 28464622577219173}; -// EOF poem: "We will be known forever by the tracks we leave" +const BYTE eof_length = 64; +// EOF poem: "We will be known forever by the tracks we leave" - Santee Sioux Native Americans from Dakota; +const std::string eof_hex = "f3da5a14f8462d0e067eea643111437b3c033b61372ab0d55a45b5b1668f18db6a29d4c87b0c3ecdcaea374d936a406c248c851fe215c2c0669e2cfcd9f734a4"; } } diff --git a/src/support/helpers.cpp b/src/support/helpers.cpp index 84aa77f..706c24e 100644 --- a/src/support/helpers.cpp +++ b/src/support/helpers.cpp @@ -6,10 +6,10 @@ #include #include -#include "TypeDefinitions.h" #include "helpers.h" #include "simd_definitions.h" #include "MagicConstants.h" +#include "type_definitions.h" namespace Tomahawk{ namespace Helpers{ @@ -76,7 +76,7 @@ std::string datetime(){ struct tm *now = localtime(&t); gettimeofday(&tv, &tz); - char buffer[23]; + char buffer[128]; sprintf(buffer, "%04u-%02u-%02u %02u:%02u:%02u,%03u", now->tm_year + 1900, now->tm_mon + 1, @@ -223,5 +223,25 @@ bool parsePositionalStringTWO(const std::string& param){ 
return(matchPositionalStringTWO(param)); } +S32 char2int(const char& input){ + if(input >= '0' && input <= '9') return input - '0'; + else if(input >= 'A' && input <= 'F') return input - 'A' + 10; + else if(input >= 'a' && input <= 'f') return input - 'a' + 10; + throw std::invalid_argument("Invalid input string"); +} + +bool HexToBytes(const std::string& hex, uint8_t* target){ + if(hex.size() % 2 != 0){ + std::cerr << "illegal uneven hex" << std::endl; + return false; + } + + U32 p = 0; + for (U32 i = 0; i < hex.length(); i += 2, ++p) + target[p] = char2int(hex[i])*16 + char2int(hex[i+1]); + + return true; +} + } } diff --git a/src/support/helpers.h b/src/support/helpers.h index cc7e863..9a488c4 100644 --- a/src/support/helpers.h +++ b/src/support/helpers.h @@ -10,7 +10,7 @@ #include #include -#include "TypeDefinitions.h" +#include "type_definitions.h" namespace Tomahawk{ namespace Helpers{ @@ -83,6 +83,8 @@ inline std::string secondsToTimestring(const double& value){ return(retVal); } +S32 char2int(const char& input); +bool HexToBytes(const std::string& hex, uint8_t* target); } } diff --git a/src/support/simd_definitions.h b/src/support/simd_definitions.h index a7f8c1f..62a3f5f 100644 --- a/src/support/simd_definitions.h +++ b/src/support/simd_definitions.h @@ -1,6 +1,8 @@ #ifndef SIMD_H_ #define SIMD_H_ +#include + #if defined(_MSC_VER) /* Microsoft C/C++-compatible compiler */ #include diff --git a/src/support/TypeDefinitions.h b/src/support/type_definitions.h similarity index 100% rename from src/support/TypeDefinitions.h rename to src/support/type_definitions.h diff --git a/src/tomahawk/TomahawkBlockManager.h b/src/tomahawk/TomahawkBlockManager.h deleted file mode 100644 index cfbf3d5..0000000 --- a/src/tomahawk/TomahawkBlockManager.h +++ /dev/null @@ -1,367 +0,0 @@ -#ifndef TOMAHAWK_TomahawkBlockManager_H_ -#define TOMAHAWK_TomahawkBlockManager_H_ - -#include "../support/simd_definitions.h" -#include "../algorithm/GenotypeBitPacker.h" -#include 
"base/TomahawkSupport.h" -#include "../tomahawk/base/TomahawkEntryMeta.h" -#include "../totempole/TotempoleEntry.h" -#include "../totempole/TotempoleReader.h" - -namespace Tomahawk{ - -template > -struct TomahawkBlock; // forward declare: required for build function - -template -struct TomahawkBlockPackedPair{ -public: - TomahawkBlockPackedPair(const U32 size): - frontZero(0), - tailZero(0), - frontZeroMissing(0), - tailZeroMissing(0), - #if SIMD_AVAILABLE == 1 - data((BYTE*)_mm_malloc(size, T)), - mask((BYTE*)_mm_malloc(size, T)) - #else - data(new BYTE[size]), - mask(new BYTE[size]) - #endif - { - memset(this->data, 0, size); - memset(this->mask, 0, size); - } - - ~TomahawkBlockPackedPair(){ - #if SIMD_AVAILABLE == 1 - _mm_free(this->data); - _mm_free(this->mask); - #else - delete [] this->data; - delete [] this->mask; - #endif - } - -public: - U32 frontZero; // leading zeros in aligned vector width - U32 tailZero; // trailing zeros in aligned vector width - U32 frontZeroMissing; // number of missing values in leading zeros - U32 tailZeroMissing; // number of missing values in trailing zeros - BYTE* data; - BYTE* mask; -} __attribute__((aligned(16))); - -class TomahawkBlockPacked{ - typedef TomahawkBlockPackedPair<> pair_type; - -public: - TomahawkBlockPacked() : width(0), data(nullptr){} - ~TomahawkBlockPacked(){ - delete [] this->data; - delete this->data; - } - - // copy constructor - TomahawkBlockPacked(const TomahawkBlockPacked& other) : - width(other.width), - data(other.data) - { - - } - - // move constructor - TomahawkBlockPacked(TomahawkBlockPacked&& other) noexcept : - width(other.width), - data(other.data) - { - other.data = nullptr; - } - - /** Move assignment operator */ - TomahawkBlockPacked& operator=(TomahawkBlockPacked&& other) noexcept{ - // prevent self-move - if(this != &other) - this->width = other.width; - - return *this; - } - - template - bool Build(TomahawkBlock& controller, const U64& samples); - - inline const pair_type& getData(const 
U32 p) const{ return(*this->data[p]); } - -public: - U32 width; - pair_type** data; -}; - - -template -struct TomahawkBlock{ - typedef Y type; - typedef type value_type; - typedef type *pointer; - typedef const type *const_pointer; - typedef type &reference; - typedef const type &const_reference; - typedef size_t size_type; - typedef ptrdiff_t difference_type; - - typedef TomahawkEntryMeta meta_type; - -public: - TomahawkBlock(const char* target, const Totempole::TotempoleEntry& support) : - metaPointer(0), - runsPointer(0), - support(&support), - meta(reinterpret_cast* const>(target)), - runs(reinterpret_cast(&target[(TOMAHAWK_ENTRY_META_SIZE + sizeof(T)) * support.variants])), - packed(new TomahawkBlockPacked) - { - - } - - ~TomahawkBlock() noexcept{} - - // copy constructor - TomahawkBlock(const TomahawkBlock& other) : - metaPointer(other.metaPointer), - runsPointer(other.runsPointer), - support(other.support), - meta(other.meta), - runs(other.runs), - packed(other.packed) - { - - } - - void operator=(const TomahawkBlock& other){ - this->metaPointer = other.metaPointer; - this->runsPointer = other.runsPointer; - } - - // move constructor - TomahawkBlock(TomahawkBlock&& other) noexcept : - metaPointer(other.metaPointer), - runsPointer(other.runsPointer), - support(other.support), - meta(other.meta), - runs(other.runs), - packed(other.packed) - { - - } - - inline void updatePacked(const TomahawkBlock& self){ - this->packed = new TomahawkBlockPacked(self.packed); - } - - - inline void operator++(void){ - this->runsPointer += this->meta[this->metaPointer].runs; - ++this->metaPointer; - } - - inline void operator--(void){ - --this->metaPointer; - this->runsPointer -= this->meta[this->metaPointer].runs; - } - - inline const meta_type& currentMeta(void) const{ return(this->meta[this->metaPointer]); } - inline const_reference operator[](const U32 p) const{ return this->runs[this->runsPointer + p]; } - - const U16& size(void) const{ return this->support->variants; } - 
void reset(void){ - this->metaPointer = 0; - this->runsPointer = 0; - } - - void WriteVariant(const Totempole::TotempoleReader& totempole, IO::BasicBuffer& buffer, bool dropGenotypes = false) const{ - // All genotypes in this line will have the same phase - const char separator = this->currentMeta().phased == 1 ? '|' : '/'; - - // Note: - // Much faster to first write to a char buffer then flush - // instead of keep writing to cout (even without manual flushing) - buffer += totempole.getContig(this->support->contigID).name; - buffer += '\t'; - buffer += std::to_string(this->currentMeta().position); - buffer += '\t'; - buffer += '.'; - buffer += '\t'; - buffer += Constants::REF_ALT_LOOKUP[this->currentMeta().ref_alt >> 4]; - buffer += '\t'; - buffer += Constants::REF_ALT_LOOKUP[this->currentMeta().ref_alt & ((1 << 4) - 1)]; - buffer += '\t'; - buffer += Constants::QUAL; - buffer += '\t'; - buffer += Constants::PASS; - buffer += '\t'; - buffer += std::string("HWE_P="); - buffer += std::to_string(this->currentMeta().HWE_P); - buffer += std::string(";MAF="); - buffer += std::to_string(this->currentMeta().MAF); - - if(!dropGenotypes){ - buffer += '\t'; - buffer += Constants::GT; - buffer += '\t'; - - // For each run length encoded entry - for(U32 i = 0; i < this->currentMeta().runs - 1; ++i){ - const char& left = Constants::TOMAHAWK_ALLELE_LOOKUP_REVERSE[(*this)[i].alleleA]; - const char& right = Constants::TOMAHAWK_ALLELE_LOOKUP_REVERSE[(*this)[i].alleleB]; - - // Repeat genotype run-length times - for(U32 k = 0; k < (*this)[i].runs; ++k){ - buffer += left; - buffer += separator; - buffer += right; - buffer += '\t'; - } - } - - // For the last run length encoded entry - const char& left = Constants::TOMAHAWK_ALLELE_LOOKUP_REVERSE[(*this)[this->currentMeta().runs - 1].alleleA]; - const char& right = Constants::TOMAHAWK_ALLELE_LOOKUP_REVERSE[(*this)[this->currentMeta().runs - 1].alleleB]; - - // Repeat genotype run-length - 1 times - // Do not put a tab delimiter last - 
for(U32 k = 0; k < (*this)[this->currentMeta().runs - 1].runs - 1; ++k){ - buffer += left; - buffer += separator; - buffer += right; - buffer += '\t'; - } - // Place a new line in the end instead - buffer += left; - buffer += separator; - buffer += right; - buffer += '\n'; - } else { - buffer += '\n'; - } - } - - bool buildPacked(const U64& samples); - void clearPacked(void){ delete this->packed; } - -public: - U32 metaPointer; - U32 runsPointer; - const Totempole::TotempoleEntry* const support; // parent Totempole information - const meta_type* const meta; - const type* const runs; - TomahawkBlockPacked* packed; -}; - -template -bool TomahawkBlockPacked::Build(TomahawkBlock& controller, const U64& samples){ - if(controller.support->variants == 0) - return false; - - controller.reset(); - TomahawkBlock>& c = *reinterpret_cast>*>(&controller); - - this->width = c.support->variants; - this->data = new pair_type*[c.support->variants]; - - const U32 byte_width = ceil((double)samples/4); - - // INVERSE mask is cheaper in terms of instructions used - // exploited in calculations: TomahawkCalculationSlave - const BYTE lookup_mask[16] = {0, 0, 3, 3, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; - const BYTE lookup_data[16] = {0, 1, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - - for(U32 i = 0; i < c.support->variants; ++i){ - this->data[i] = new pair_type(byte_width); - Algorithm::GenotypeBitPacker packerA(this->data[i]->data, 2); - Algorithm::GenotypeBitPacker packerB(this->data[i]->mask, 2); - - for(U32 j = 0; j < c.meta[i].runs; ++j){ - packerA.add(lookup_data[c[j].alleles], c[j].runs); - packerB.add(lookup_mask[c[j].alleles], c[j].runs); - } - ++c; - } - controller.reset(); - - const U32 byteAlignedEnd = byte_width/(GENOTYPE_TRIP_COUNT/4)*(GENOTYPE_TRIP_COUNT/4); - - // Search for zero runs in either end - for(U32 i = 0; i < c.support->variants; ++i){ - S32 j = 0; - - // Search from left->right - for(; j < byteAlignedEnd; ++j){ - if(this->data[i]->data[j] != 0 || 
this->data[i]->mask[j] != 0) - break; - } - - // Front of zeroes - this->data[i]->frontZero = ((j - 1 < 0 ? 0 : j - 1)*4)/GENOTYPE_TRIP_COUNT; - if(j == byteAlignedEnd) - break; - - j = byteAlignedEnd - 1; - for(; j > 0; --j){ - if(this->data[i]->data[j] != 0 || this->data[i]->mask[j] != 0) - break; - } - - // Tail of zeroes - this->data[i]->tailZero = ((byteAlignedEnd - (j+1))*4)/GENOTYPE_TRIP_COUNT; - } - return true; -} - -template -bool TomahawkBlock::buildPacked(const U64& samples){ - return(this->packed->Build(*this, samples)); -} - - -template -class TomahawkBlockManager{ - typedef TomahawkBlockManager self_type; - typedef TomahawkBlock controller_type; - typedef Totempole::TotempoleEntry totempole_entry_type; - -public: - TomahawkBlockManager(const Totempole::TotempoleReader& header) : - header(header) - {} - ~TomahawkBlockManager(){} - - controller_type operator[](const U32 p) const{ return(controller_type(this->blocks[p])); } // copy constructor return - void Add(const char* data, const totempole_entry_type& entry){ this->blocks.push_back(controller_type(data, entry)); } - bool BuildVectorized(void){ - for(U32 i = 0; i < this->blocks.size(); ++i) - this->blocks[i].buildPacked(header.getSamples()); - - return true; - } - - inline size_t size(void) const{ return this->blocks.size(); } - U32 getVariants(void) const{ - U32 variants = 0; - - for(U32 i = 0; i < this->size(); ++i) - variants += this->blocks[i].size(); - - return variants; - } - -public: - std::vector blocks; - - // Header - const Totempole::TotempoleReader& header; -}; - - -} - -#endif /* TOMAHAWK_TomahawkBlockManager_H_ */ diff --git a/src/tomahawk/TomahawkCalc.cpp b/src/tomahawk/TomahawkCalc.cpp index 51840cb..961a4d7 100644 --- a/src/tomahawk/TomahawkCalc.cpp +++ b/src/tomahawk/TomahawkCalc.cpp @@ -10,7 +10,7 @@ TomahawkCalc::TomahawkCalc(void) : TomahawkCalc::~TomahawkCalc(){} bool TomahawkCalc::Open(const std::string input, const std::string output){ - if(!this->reader.Open(input)){ + 
if(!this->reader.open(input)){ return false; } @@ -77,7 +77,7 @@ bool TomahawkCalc::Calculate(){ this->balancer.setSelected(this->parameters.chunk_selected); this->balancer.setDesired(this->parameters.n_chunks); - if(!this->balancer.Build(this->reader.getTotempole().getBlocks(), this->parameters.n_threads)){ + if(!this->balancer.Build(this->reader.getIndex().getContainer().size(), this->parameters.n_threads)){ std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Failed to split into blocks..." << std::endl; return false; } diff --git a/src/tomahawk/TomahawkCalc.h b/src/tomahawk/TomahawkCalc.h index 92c1bfb..01f038f 100644 --- a/src/tomahawk/TomahawkCalc.h +++ b/src/tomahawk/TomahawkCalc.h @@ -1,21 +1,23 @@ #ifndef TOMAHAWK_TOMAHAWKCALC_H_ #define TOMAHAWK_TOMAHAWKCALC_H_ -#include "../totempole/TotempoleReader.h" #include "TomahawkReader.h" -#include "TomahawkOutput/TomahawkOutputManager.h" +#include "twk_reader_implementation.h" +#include "genotype_meta_container_reference.h" +#include "../io/output_writer.h" +#include "../index/index.h" namespace Tomahawk { class TomahawkCalc{ - typedef TomahawkCalc self_type; - typedef TomahawkCalcParameters parameter_type; - typedef std::pair pair_type; - typedef std::vector pair_vector; - typedef Balancer balancer_type; - typedef Totempole::TotempoleReader totempole_reader; - typedef Interface::ProgressBar progress_type; - typedef TomahawkReader reader_type; + typedef TomahawkCalc self_type; + typedef TomahawkCalcParameters parameter_type; + typedef std::pair pair_type; + typedef std::vector pair_vector; + typedef LoadBalancerLD balancer_type; + typedef TomahawkHeader header_type; + typedef Interface::ProgressBar progress_type; + typedef TomahawkReader reader_type; public: TomahawkCalc(); @@ -31,59 +33,75 @@ class TomahawkCalc{ template bool Calculate(); private: - std::string input_file; - std::string output_file; - - bool parameters_validated; - progress_type progress; - balancer_type balancer; + std::string input_file; 
+ std::string output_file; + bool parameters_validated; + progress_type progress; + balancer_type balancer; parameter_type parameters; - reader_type reader; + reader_type reader; }; template bool TomahawkCalc::Calculate(){ // Retrieve reference to Totempole reader - totempole_reader& totempole = this->reader.getTotempole(); - totempole.addLiteral("\n##tomahawk_calcCommand=" + Helpers::program_string()); - totempole.addLiteral("\n##tomahawk_calcInterpretedCommand=" + this->parameters.getInterpretedString()); + header_type& header = this->reader.getHeader(); + header.addLiteral("\n##tomahawk_calcCommand=" + Helpers::program_string()); + header.addLiteral("\n##tomahawk_calcInterpretedCommand=" + this->parameters.getInterpretedString()); - IO::TomahawkOutputManager writer; - if(!writer.Open(this->output_file, totempole)){ + IO::OutputWriter writer; + if(!writer.open(this->output_file)){ std::cerr << Helpers::timestamp("ERROR", "TWI") << "Failed to open..." << std::endl; return false; } - // Construct Tomahawk manager - TomahawkBlockManager controller(totempole); - for(U32 i = 0; i < this->reader.DataOffsetSize(); ++i) - controller.Add(this->reader.getOffsetPair(i).data, this->reader.getOffsetPair(i).entry); + writer.writeHeaders(this->reader.getHeader()); if(!SILENT){ -#if SIMD_AVAILABLE == 1 + #if SIMD_AVAILABLE == 1 std::cerr << Helpers::timestamp("LOG","SIMD") << "Vectorized instructions available: " << SIMD_MAPPING[SIMD_VERSION] << "..." << std::endl; -#else + #else std::cerr << Helpers::timestamp("LOG","SIMD") << "No vectorized instructions available..." << std::endl; -#endif + #endif std::cerr << Helpers::timestamp("LOG","SIMD") << "Building 1-bit representation: "; } - // Build 1-bit representation from RLE data - if(!controller.BuildVectorized()){ - std::cerr << Helpers::timestamp("ERROR", "SIMD") << "Failed building bit-representation..." 
<< std::endl; - return false; + // Construct Tomahawk manager + //TomahawkReaderImpl impl(totempole.header.samples, this->reader.DataOffsetSize()+1); + + //std::cerr << "not implemented" << std::endl; + //exit(1); + + GenotypeMetaContainerReference references(header.magic_.getNumberSamples(), this->reader.DataOffsetSize()+1); + + U64 n_variants = 0; + for(U32 i = 0; i < this->reader.DataOffsetSize(); ++i){ + // Hard copy of data into STL-like containers + /* + impl.addDataBlock(this->reader.getOffsetPair(i).data, + this->reader.getOffsetPair(i).l_buffer, + this->reader.getOffsetPair(i).entry); + */ + + // Reference interpretation of char buffer in psuedo-iterator containers + // directly from unaligned memory + references.addDataBlock(this->reader.getOffsetPair(i).data, + this->reader.getOffsetPair(i).l_buffer, + this->reader.getOffsetPair(i).entry); + + n_variants += references[i].getTotempole().size(); } if(!SILENT) std::cerr << "Done..." << std::endl; // Number of variants in memory - const U64 variants = controller.getVariants(); + const U64 variants = references.countVariants(); if(!SILENT) std::cerr << Helpers::timestamp("LOG","CALC") << "Total " << Helpers::ToPrettyString(variants) << " variants..." 
<< std::endl; - // Todo: validate + // Todo: validate & decouple U64 totalComparisons = 0; for(U32 i = 0; i < this->balancer.thread_distribution.size(); ++i){ for(U32 j = 0; j < this->balancer.thread_distribution[i].size(); ++j){ @@ -93,11 +111,13 @@ bool TomahawkCalc::Calculate(){ for(U32 col = from; col < this->balancer.thread_distribution[i][j].toColumn; ++col){ //std::cerr << '\t' << from << ":" << col << '\t'; if(from == col){ - const U32 size = controller[from].size(); + //const U32 size = impl[from].size(); + const U32 size = this->reader.getOffsetPair(from).entry.size(); totalComparisons += (size*size - size)/2; //std::cerr << (size*size - size)/2 << std::endl; } else { - totalComparisons += controller[from].size() * controller[col].size(); + //totalComparisons += impl[from].size() * impl[col].size(); + totalComparisons += this->reader.getOffsetPair(from).entry.size() * this->reader.getOffsetPair(col).entry.size(); //std::cerr << controller[from].size() * controller[col].size() << std::endl; } } @@ -107,11 +127,13 @@ bool TomahawkCalc::Calculate(){ for(U32 col = this->balancer.thread_distribution[i][j].fromColumn; col < this->balancer.thread_distribution[i][j].toColumn; ++col){ //std::cerr << '\t' << from << ":" << col << '\t'; if(from == col){ - const U32 size = controller[from].size(); + //const U32 size = impl[from].size(); + const U32 size = this->reader.getOffsetPair(from).entry.size(); totalComparisons += (size*size - size)/2; //std::cerr << (size*size - size)/2 << std::endl; } else { - totalComparisons += controller[from].size() * controller[col].size(); + //totalComparisons += impl[from].size() * impl[col].size(); + totalComparisons += this->reader.getOffsetPair(from).entry.size() * this->reader.getOffsetPair(col).entry.size(); //std::cerr << controller[from].size() * controller[col].size() << std::endl; } } @@ -122,14 +144,14 @@ bool TomahawkCalc::Calculate(){ // Update progress bar with data this->progress.SetComparisons(totalComparisons); - 
this->progress.SetSamples(totempole.getSamples()); + this->progress.SetSamples(header.magic_.getNumberSamples()); this->progress.SetDetailed(this->parameters.detailed_progress); if(!SILENT) std::cerr << Helpers::timestamp("LOG","CALC") << "Performing " << Helpers::ToPrettyString(totalComparisons) << " variant comparisons..."<< std::endl; // Setup slaves - TomahawkCalculateSlave** slaves = new TomahawkCalculateSlave*[this->parameters.n_threads]; + LDSlave** slaves = new LDSlave*[this->parameters.n_threads]; std::vector thread_pool; // Setup workers @@ -139,7 +161,7 @@ bool TomahawkCalc::Calculate(){ } for(U32 i = 0; i < this->parameters.n_threads; ++i){ - slaves[i] = new TomahawkCalculateSlave(controller, writer, this->progress, this->parameters, this->balancer.thread_distribution[i]); + slaves[i] = new LDSlave(references, writer, this->progress, this->parameters, this->balancer.thread_distribution[i]); if(!SILENT) std::cerr << '.'; } @@ -177,23 +199,28 @@ bool TomahawkCalc::Calculate(){ for(U32 i = 1; i < this->parameters.n_threads; ++i) *slaves[0] += *slaves[i]; - writer = slaves[0]->getOutputManager().getTotempoleBlocks(); + writer = slaves[0]->getWriter(); if(!SILENT){ - std::cerr << Helpers::timestamp("LOG") << "Throughput: " << timer.ElapsedString() << " (" << Helpers::ToPrettyString((U64)ceil((double)slaves[0]->getComparisons()/timer.Elapsed().count())) << " pairs of SNP/s, " << Helpers::ToPrettyString((U64)ceil((double)slaves[0]->getComparisons()*totempole.getSamples()/timer.Elapsed().count())) << " genotypes/s)..." << std::endl; - std::cerr << Helpers::timestamp("LOG") << "Comparisons: " << Helpers::ToPrettyString(slaves[0]->getComparisons()) << " pairwise SNPs and " << Helpers::ToPrettyString(slaves[0]->getComparisons()*totempole.getSamples()) << " pairwise genotypes. Output " << Helpers::ToPrettyString(this->progress.GetOutputCounter()) << "..." 
<< std::endl; + std::cerr << Helpers::timestamp("LOG") << "Throughput: " << timer.ElapsedString() << " (" << Helpers::ToPrettyString((U64)ceil((double)slaves[0]->getComparisons()/timer.Elapsed().count())) << " pairs of SNP/s, " << Helpers::ToPrettyString((U64)ceil((double)slaves[0]->getComparisons()*header.magic_.getNumberSamples()/timer.Elapsed().count())) << " genotypes/s)..." << std::endl; + std::cerr << Helpers::timestamp("LOG") << "Comparisons: " << Helpers::ToPrettyString(slaves[0]->getComparisons()) << " pairwise SNPs and " << Helpers::ToPrettyString(slaves[0]->getComparisons()*header.magic_.getNumberSamples()) << " pairwise genotypes..." << std::endl; + std::cerr << Helpers::timestamp("LOG") << "Output: " << Helpers::ToPrettyString(writer.sizeEntries()) << " entries into " << Helpers::ToPrettyString(writer.sizeBlocks()) << " blocks..." << std::endl; } // Cleanup delete [] slaves; // Flush writer - writer.flushBlock(); + writer.flush(); + writer.writeFinal(); + /* if(!writer.finalise()){ std::cerr << Helpers::timestamp("ERROR", "INDEX") << "Failed to finalize..." 
<< std::endl; return false; } + writer.close(); + */ return true; } diff --git a/src/tomahawk/TomahawkCalcParameters.cpp b/src/tomahawk/TomahawkCalcParameters.cpp deleted file mode 100644 index cd951a2..0000000 --- a/src/tomahawk/TomahawkCalcParameters.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef TOMAHAWK_TOMAHAWKCALCPARAMETERS_CPP_ -#define TOMAHAWK_TOMAHAWKCALCPARAMETERS_CPP_ - -#include "TomahawkCalcParameters.h" - -namespace Tomahawk{ - - - -} - -#endif /* TOMAHAWK_TOMAHAWKCALCPARAMETERS_CPP_ */ diff --git a/src/tomahawk/TomahawkCalcParameters.h b/src/tomahawk/TomahawkCalcParameters.h index c481c4d..eff5f92 100644 --- a/src/tomahawk/TomahawkCalcParameters.h +++ b/src/tomahawk/TomahawkCalcParameters.h @@ -6,17 +6,19 @@ namespace Tomahawk{ -#define CALC_DEFAULT_MINR2 0.1 -#define CALC_DEFAULT_MAXR2 1 -#define CALC_DEFAULT_MINP 1e-4 +#define CALC_DEFAULT_MINR2 0.1 +#define CALC_DEFAULT_MAXR2 1.0 +#define CALC_DEFAULT_MINP 1e-4 #define CALC_DEFAULT_MINALLELES 5 #define CALC_DEFAULT_MAXALLELES std::numeric_limits::max() struct TomahawkCalcParameters{ - typedef TomahawkCalcParameters self_type; +public: + typedef TomahawkCalcParameters self_type; typedef IO::GenericWriterInterace writer_type; enum force_method {none, phasedFunction, unphasedFunction}; +public: TomahawkCalcParameters() : n_threads(std::thread::hardware_concurrency() > 0 ? 
std::thread::hardware_concurrency() : 1), n_chunks(1), @@ -123,17 +125,18 @@ struct TomahawkCalcParameters{ return(os); } - S32 n_threads; - S32 n_chunks; - S32 chunk_selected; - double R2_min; - double R2_max; - double P_threshold; +public: + S32 n_threads; + S32 n_chunks; + S32 chunk_selected; + double R2_min; + double R2_max; + double P_threshold; int64_t minimum_alleles; int64_t maximum_alleles; writer_type::compression compression_type; force_method force; - bool detailed_progress; + bool detailed_progress; }; } diff --git a/src/tomahawk/TomahawkImportWriter.cpp b/src/tomahawk/TomahawkImportWriter.cpp deleted file mode 100644 index b231d5e..0000000 --- a/src/tomahawk/TomahawkImportWriter.cpp +++ /dev/null @@ -1,311 +0,0 @@ -#include "TomahawkImportWriter.h" - -namespace Tomahawk { - -TomahawkImportWriter::TomahawkImportWriter(const filter_type& filter) : - flush_limit(1000000), - n_variants_limit(1024), - blocksWritten_(0), - variants_written_(0), - largest_uncompressed_block_(0), - filter(filter), - rleController_(nullptr), - buffer_rle_(flush_limit*2), - buffer_meta_(flush_limit*2), - vcf_header_(nullptr) -{} - -TomahawkImportWriter::~TomahawkImportWriter(){ - delete this->rleController_; - this->buffer_rle_.deleteAll(); - this->buffer_meta_.deleteAll(); -} - -bool TomahawkImportWriter::Open(const std::string output){ - this->filename = output; - this->CheckOutputNames(output); - this->streamTomahawk.open(this->basePath + this->baseName + '.' + Constants::OUTPUT_SUFFIX, std::ios::out | std::ios::binary); - this->streamTotempole.open(this->basePath + this->baseName + '.' + Constants::OUTPUT_SUFFIX + '.' + Constants::OUTPUT_INDEX_SUFFIX, std::ios::out | std::ios::binary); - - // Check streams - if(!this->streamTomahawk.good()){ - std::cerr << Helpers::timestamp("ERROR", "WRITER") << "Could not open: " << this->basePath + this->baseName + '.' + Constants::OUTPUT_SUFFIX << "!" 
<< std::endl; - return false; - } - if(!this->streamTotempole.good()){ - std::cerr << Helpers::timestamp("ERROR", "WRITER") << "Could not open: " << this->basePath + this->baseName + '.' + Constants::OUTPUT_SUFFIX + '.' + Constants::OUTPUT_INDEX_SUFFIX << "!" << std::endl; - return false; - } - - if(!SILENT){ - std::cerr << Helpers::timestamp("LOG", "WRITER") << "Opening: " << this->basePath + this->baseName + '.' + Constants::OUTPUT_SUFFIX << "..." << std::endl; - std::cerr << Helpers::timestamp("LOG", "WRITER") << "Opening: " << this->basePath + this->baseName + '.' + Constants::OUTPUT_SUFFIX + '.' + Constants::OUTPUT_INDEX_SUFFIX << "..." << std::endl; - } - - // Write Tomahawk and Totempole headers - this->WriteHeaders(); - - // Determine flush limit - this->DetermineFlushLimit(); - - return true; -} - -void TomahawkImportWriter::DetermineFlushLimit(void){ - this->flush_limit = this->vcf_header_->samples * this->n_variants_limit / 10; // Worst case - if(this->vcf_header_->samples <= Constants::UPPER_LIMIT_SAMPLES_8B - 1) - this->flush_limit *= sizeof(BYTE); - else if(this->vcf_header_->samples <= Constants::UPPER_LIMIT_SAMPLES_16B - 1) - this->flush_limit *= sizeof(U16); - else if(this->vcf_header_->samples <= Constants::UPPER_LIMIT_SAMPLES_32B - 1) - this->flush_limit *= sizeof(U32); - else this->flush_limit *= sizeof(U64); -} - -bool TomahawkImportWriter::OpenExtend(const std::string output){ - this->filename = output; - this->CheckOutputNames(output); - this->streamTomahawk.open(output, std::ios::in | std::ios::out | std::ios::binary | std::ios::ate); - this->streamTotempole.open(output + '.' + Constants::OUTPUT_INDEX_SUFFIX, std::ios::in | std::ios::out | std::ios::binary | std::ios::ate); - - // Check streams - if(!this->streamTomahawk.good()){ - std::cerr << Helpers::timestamp("ERROR", "WRITER") << "Could not open: " << output << "!" 
<< std::endl; - return false; - } - if(!this->streamTotempole.good()){ - std::cerr << Helpers::timestamp("ERROR", "WRITER") << "Could not open: " << output + '.' + Constants::OUTPUT_INDEX_SUFFIX << "!" << std::endl; - return false; - } - - if(!SILENT){ - std::cerr << Helpers::timestamp("LOG", "WRITER") << "Extending: " << output << "..." << std::endl; - std::cerr << Helpers::timestamp("LOG", "WRITER") << "Extending: " << output + '.' + Constants::OUTPUT_INDEX_SUFFIX << "..." << std::endl; - } - - U64 tempsize = this->streamTomahawk.tellp(); - this->streamTomahawk.seekp(tempsize - sizeof(U64)*Tomahawk::Constants::eof_length); - tempsize = this->streamTotempole.tellp(); - this->streamTotempole.seekp(tempsize - sizeof(U64)*Tomahawk::Constants::eof_length); - - // Determine flush limit - this->DetermineFlushLimit(); - - return true; -} - -void TomahawkImportWriter::WriteHeaders(void){ - if(this->vcf_header_ == nullptr){ - std::cerr << Helpers::timestamp("ERROR", "INTERNAL") << "Header not set!" << std::endl; - exit(1); - } - - this->streamTotempole.write(Constants::WRITE_HEADER_INDEX_MAGIC, Constants::WRITE_HEADER_MAGIC_INDEX_LENGTH); - this->streamTomahawk.write(Constants::WRITE_HEADER_MAGIC, Constants::WRITE_HEADER_MAGIC_LENGTH); - - const U64& samples = this->vcf_header_->samples; - Totempole::TotempoleHeader h(samples); - this->streamTotempole << h; - Totempole::TotempoleHeaderBase* hB = reinterpret_cast(&h); - this->streamTomahawk << *hB; - - // Write out dummy variable for IO offset - U32 nothing = 0; // Dummy variable - size_t posOffset = this->streamTotempole.tellp(); // remember current IO position - this->streamTotempole.write(reinterpret_cast(¬hing), sizeof(U32)); // data offset - - // Write the number of contigs - const U32 n_contigs = this->vcf_header_->contigs.size(); - this->streamTotempole.write(reinterpret_cast(&n_contigs), sizeof(U32)); - - // Write contig data to Totempole - // length | n_char | chars[0 .. 
n_char - 1] - for(U32 i = 0; i < this->vcf_header_->contigs.size(); ++i){ - Totempole::TotempoleContigBase contig(this->vcf_header_->contigs[i].length, - this->vcf_header_->contigs[i].name.size(), - this->vcf_header_->contigs[i].name); - - this->streamTotempole << contig; - } - - // Write sample names - // n_char | chars[0..n_char - 1] - for(U32 i = 0; i < samples; ++i){ - const U32 n_char = this->vcf_header_->sampleNames[i].size(); - this->streamTotempole.write(reinterpret_cast(&n_char), sizeof(U32)); - this->streamTotempole.write(reinterpret_cast(&this->vcf_header_->sampleNames[i][0]), n_char); - } - - // Push in VCF header and executed line - buffer_type temp(this->vcf_header_->literal_lines.size()*65536); - for(U32 i = 0; i < this->vcf_header_->literal_lines.size(); ++i) - temp += this->vcf_header_->literal_lines[i] + '\n'; - - const std::string command = "##tomahawk_importCommand=" + std::string(Constants::LITERAL_COMMAND_LINE) - + "; VERSION=" + std::string(VERSION) - + "; Date=" + Tomahawk::Helpers::datetime() + "; SIMD=" + SIMD_MAPPING[SIMD_VERSION]; - - temp += command; - this->gzip_controller_.Deflate(temp); - this->streamTotempole.write(&this->gzip_controller_.buffer.data[0], this->gzip_controller_.buffer.pointer); - this->gzip_controller_.Clear(); - temp.deleteAll(); - - U32 curPos = this->streamTotempole.tellp(); // remember current IO position - this->streamTotempole.seekp(posOffset); // seek to previous position - this->streamTotempole.write(reinterpret_cast(&curPos), sizeof(U32)); // overwrite data offset - this->streamTotempole.seekp(curPos); // seek back to current IO position -} - -void TomahawkImportWriter::WriteFinal(void){ - // Write EOF - for(U32 i = 0; i < Constants::eof_length; ++i){ - this->streamTotempole.write(reinterpret_cast(&Constants::eof[i]), sizeof(U64)); - this->streamTomahawk.write(reinterpret_cast(&Constants::eof[i]), sizeof(U64)); - } - - // Re-open file and overwrite block counts and offset - const U32 shift = 
Constants::WRITE_HEADER_MAGIC_INDEX_LENGTH + sizeof(float) + sizeof(U64) + sizeof(BYTE); - this->streamTotempole.flush(); - std::fstream streamTemp(this->basePath + this->baseName + '.' + Constants::OUTPUT_SUFFIX + '.' + Constants::OUTPUT_INDEX_SUFFIX, std::ios_base::binary | std::ios_base::out | std::ios_base::in); - - if(!streamTemp.good()){ - std::cerr << Helpers::timestamp("ERROR", "WRITER") << "Could not re-open file!" << std::endl; - exit(1); - } - - streamTemp.seekg(shift); - streamTemp.write(reinterpret_cast(&this->blocksWritten_), sizeof(U32)); - streamTemp.write(reinterpret_cast(&this->largest_uncompressed_block_), sizeof(U32)); - streamTemp.flush(); - streamTemp.close(); -} - -void TomahawkImportWriter::setHeader(VCF::VCFHeader& header){ - this->vcf_header_ = &header; - this->rleController_ = new Algorithm::TomahawkImportRLE(header.samples); - this->rleController_->DetermineBitWidth(); -} - -bool TomahawkImportWriter::add(const VCF::VCFLine& line){ - const U32 meta_start_pos = this->buffer_meta_.pointer; - const U32 rle_start_pos = this->buffer_rle_.pointer; - if(!this->rleController_->RunLengthEncode(line, this->buffer_meta_, this->buffer_rle_)){ - this->buffer_meta_.pointer = meta_start_pos; // reroll back - this->buffer_rle_.pointer = rle_start_pos; // reroll back - return false; - } - - const U64 n_runs = (this->buffer_rle_.pointer - rle_start_pos)/this->rleController_->getBitWidth(); - const TomahawkEntryMetaBase& base_meta = *reinterpret_cast(&this->buffer_meta_[meta_start_pos]); - - if(n_runs == 1){ - this->buffer_meta_.pointer = meta_start_pos; // reroll back - this->buffer_rle_.pointer = rle_start_pos; // reroll back - //std::cerr << "singleton" << std::endl; - return false; - } - - if(base_meta.HWE_P < this->filter.HWE_P){ - this->buffer_meta_.pointer = meta_start_pos; // reroll back - this->buffer_rle_.pointer = rle_start_pos; // reroll back - //std::cerr << "HWE_P < " << this->filter.HWE_P << ": " << base_meta.HWE_P << '\t' << base_meta << 
std::endl; - return false; - } - - if(base_meta.MAF < this->filter.MAF){ - this->buffer_meta_.pointer = meta_start_pos; // reroll back - this->buffer_rle_.pointer = rle_start_pos; // reroll back - //std::cerr << "MAF < " << this->filter.MAF << ": " << base_meta.MAF << '\t' << base_meta << std::endl; - return false; - } - - if(this->totempole_entry.minPosition == 0) - this->totempole_entry.minPosition = line.position; - - this->totempole_entry.maxPosition = line.position; - ++this->totempole_entry; - - return true; -} - -bool TomahawkImportWriter::add(const BCF::BCFEntry& line){ - const U32 meta_start_pos = this->buffer_meta_.pointer; - const U32 rle_start_pos = this->buffer_rle_.pointer; - if(!this->rleController_->RunLengthEncode(line, this->buffer_meta_, this->buffer_rle_)){ - this->buffer_meta_.pointer = meta_start_pos; // reroll back - this->buffer_rle_.pointer = rle_start_pos; // reroll back - return false; - } - - const U64 n_runs = (this->buffer_rle_.pointer - rle_start_pos)/this->rleController_->getBitWidth(); - const TomahawkEntryMetaBase& base_meta = *reinterpret_cast(&this->buffer_meta_[meta_start_pos]); - - if(n_runs == 1){ - this->buffer_meta_.pointer = meta_start_pos; // reroll back - this->buffer_rle_.pointer = rle_start_pos; // reroll back - return false; - } - - if(base_meta.HWE_P < this->filter.HWE_P){ - this->buffer_meta_.pointer = meta_start_pos; // reroll back - this->buffer_rle_.pointer = rle_start_pos; // reroll back - //std::cerr << "HWE_P < " << this->filter.HWE_P << ": " << base_meta.HWE_P << std::endl; - return false; - } - - if(base_meta.MAF < this->filter.MAF){ - this->buffer_meta_.pointer = meta_start_pos; // reroll back - this->buffer_rle_.pointer = rle_start_pos; // reroll back - //std::cerr << "MAF < " << this->filter.MAF << ": " << base_meta.MAF << std::endl; - return false; - } - - if(this->totempole_entry.minPosition == 0) - this->totempole_entry.minPosition = line.body->POS + 1; - - this->totempole_entry.maxPosition = 
line.body->POS + 1; - ++this->totempole_entry; - - return true; -} - -// flush and write -bool TomahawkImportWriter::flush(void){ - if(this->buffer_meta_.size() == 0){ - //std::cerr << Helpers::timestamp("ERROR", "WRITER") << "Cannot flush writer with 0 entries..." << std::endl; - return false; - } - - this->totempole_entry.byte_offset = this->streamTomahawk.tellp(); // IO offset in Tomahawk output - this->gzip_controller_.Deflate(this->buffer_meta_, this->buffer_rle_); // Deflate block - this->streamTomahawk << this->gzip_controller_; // Write tomahawk output - this->gzip_controller_.Clear(); // Clean up gzip controller - - // Keep track of largest block observed - if(this->buffer_meta_.size() > this->largest_uncompressed_block_) - this->largest_uncompressed_block_ = this->buffer_meta_.size(); - - this->totempole_entry.uncompressed_size = this->buffer_meta_.size(); // Store uncompressed size - this->totempole_entry.byte_offset_end = this->streamTomahawk.tellp(); // IO offset in Tomahawk output - this->streamTotempole << this->totempole_entry; // Write totempole output - ++this->blocksWritten_; // update number of blocks written - this->variants_written_ += this->totempole_entry.variants; // update number of variants written - - this->reset(); // reset buffers - return true; -} - -void TomahawkImportWriter::CheckOutputNames(const std::string& input){ - std::vector paths = Helpers::filePathBaseExtension(input); - this->basePath = paths[0]; - if(this->basePath.size() > 0) - this->basePath += '/'; - - if(paths[3].size() == Constants::OUTPUT_SUFFIX.size() && strncasecmp(&paths[3][0], &Constants::OUTPUT_SUFFIX[0], Constants::OUTPUT_SUFFIX.size()) == 0) - this->baseName = paths[2]; - else this->baseName = paths[1]; -} - - -} /* namespace Tomahawk */ diff --git a/src/tomahawk/TomahawkImporter.cpp b/src/tomahawk/TomahawkImporter.cpp index ddca97a..5072c49 100644 --- a/src/tomahawk/TomahawkImporter.cpp +++ b/src/tomahawk/TomahawkImporter.cpp @@ -3,20 +3,22 @@ #include 
#include "../io/reader.h" -#include "TomahawkImportWriter.h" +#include "import_writer.h" #include "TomahawkReader.h" namespace Tomahawk { -TomahawkImporter::TomahawkImporter(std::string inputFile, std::string outputPrefix) : +TomahawkImporter::TomahawkImporter(const std::string inputFile, const std::string outputPrefix) : block_flush_limit(65536), - inputFile(inputFile), - outputPrefix(outputPrefix), + input_file(inputFile), + output_prefix(outputPrefix), reader_(inputFile), writer_(this->filters), - header_(nullptr), + vcf_header_(nullptr), rle_controller(nullptr) -{} +{ + +} TomahawkImporter::~TomahawkImporter(){ delete this->rle_controller; @@ -24,91 +26,12 @@ TomahawkImporter::~TomahawkImporter(){ } bool TomahawkImporter::Extend(std::string extendFile){ - if(this->inputFile.size() == 0){ - std::cerr << Helpers::timestamp("ERROR","VCF") << "No input file provided..." << std::endl; - return false; - } - - if(extendFile.size() == 0){ - std::cerr << Helpers::timestamp("ERROR","VCF") << "No file to extend provided..." << std::endl; - return false; - } - - if(!this->reader_.open()){ - std::cerr << Helpers::timestamp("ERROR","VCF") << "Failed to open file..." << std::endl; - return false; - } - - TomahawkReader tReader; - if(!tReader.Open(extendFile)){ - std::cerr << Helpers::timestamp("ERROR","IMPORT") << "Failed to read file..." 
<< std::endl; - return false; - } - - const Totempole::TotempoleReader& totempole = tReader.getTotempole(); - *this->header_ = totempole; // Convert data in totempole to VCF header - - // Parse lines - line_type line(totempole.getHeader().samples); - - // Spawn RLE controller - this->rle_controller = new rle_controller_type(this->header_->samples); - this->rle_controller->DetermineBitWidth(); - - this->reader_.clear(); - // seek reader until line does not start with '#' - std::string templine; - while(getline(this->reader_.stream_, templine)){ - if(templine[0] != '#') - break; - } - this->reader_.stream_.seekg((U64)this->reader_.stream_.tellg() - templine.size() - 1); - - this->sort_order_helper.previous_position = totempole.back().maxPosition; - this->sort_order_helper.prevcontigID = totempole.back().contigID; - - this->writer_.setHeader(*this->header_); - this->writer_.blocksWritten_ = totempole.getHeader().blocks; - this->writer_.largest_uncompressed_block_ = totempole.getHeader().largest_uncompressed; - if(!this->writer_.OpenExtend(extendFile)) - return false; - - // While there are lines - while(this->reader_.getLine()){ - // Parse them - if(!this->parseVCFLine(line)){ - return false; - } - } // end while there are vcf lines - - // This only happens if there are no valid entries in the file - if(this->sort_order_helper.contigID == nullptr){ - std::cerr << Helpers::timestamp("ERROR","IMPORT") << "Did not import any variants..." << std::endl; - return false; - } - - ++this->header_->getContig(*this->sort_order_helper.contigID); - this->writer_.flush(); - - this->writer_.WriteFinal(); - - if(this->writer_.GetVariantsWritten() == 0){ - std::cerr << Helpers::timestamp("ERROR","IMPORT") << "Did not import any variants..." 
<< std::endl; - return false; - } - - if(!SILENT) - std::cerr << Helpers::timestamp("LOG", "WRITER") << "Wrote: " << Helpers::NumberThousandsSeparator(std::to_string(this->writer_.GetVariantsWritten())) - << " variants to " << Helpers::NumberThousandsSeparator(std::to_string(this->writer_.blocksWritten())) - << " blocks..." << std::endl; - // Garbage - this->header_->unsetBorrowedPointers(); return true; } bool TomahawkImporter::Build(){ - std::ifstream temp(this->inputFile, std::ios::binary | std::ios::in); + std::ifstream temp(this->input_file, std::ios::binary | std::ios::in); if(!temp.good()){ std::cerr << Helpers::timestamp("ERROR", "IMPORT") << "Failed to open file..." << std::endl; return false; @@ -136,28 +59,28 @@ bool TomahawkImporter::Build(){ bool TomahawkImporter::BuildBCF(void){ bcf_reader_type reader; - if(!reader.open(this->inputFile)){ + if(!reader.open(this->input_file)){ std::cerr << Helpers::timestamp("ERROR", "BCF") << "Failed to open BCF file..." << std::endl; return false; } - this->header_ = &reader.header; - if(this->header_->samples == 0){ + this->vcf_header_ = &reader.header; + if(this->vcf_header_->samples == 0){ std::cerr << Helpers::timestamp("ERROR", "BCF") << "No samples detected in header..." << std::endl; return false; } - if(this->header_->samples == 1){ + if(this->vcf_header_->samples == 1){ std::cerr << Helpers::timestamp("ERROR", "IMPORT") << "Cannot run " << Tomahawk::Constants::PROGRAM_NAME << " with a single sample..." << std::endl; return false; } // Spawn RLE controller - this->rle_controller = new rle_controller_type(this->header_->samples); + this->rle_controller = new rle_controller_type(this->vcf_header_->samples); this->rle_controller->DetermineBitWidth(); this->writer_.setHeader(reader.header); - if(!this->writer_.Open(this->outputPrefix)){ + if(!this->writer_.Open(this->output_prefix)){ std::cerr << Helpers::timestamp("ERROR", "WRITER") << "Failed to open writer..." 
<< std::endl; return false; } @@ -179,11 +102,11 @@ bool TomahawkImporter::BuildBCF(void){ entry.reset(); S32 contigID = entry.body->CHROM; - this->sort_order_helper.previous_position = entry.body->POS; - this->sort_order_helper.contigID = &contigID; - this->sort_order_helper.prevcontigID = contigID; - this->writer_.totempole_entry.contigID = contigID; - this->writer_.totempole_entry.minPosition = entry.body->POS; + this->sort_order_helper.previous_position = entry.body->POS; + this->sort_order_helper.contigID = &contigID; + this->sort_order_helper.prevcontigID = contigID; + this->writer_.totempole_entry.contigID = contigID; + this->writer_.totempole_entry.min_position = entry.body->POS; if(!this->parseBCFLine(entry)){ std::cerr << Helpers::timestamp("ERROR", "BCF") << "Failed to parse BCF entry..." << std::endl; @@ -212,11 +135,12 @@ bool TomahawkImporter::BuildBCF(void){ return false; } - ++this->header_->getContig(*this->sort_order_helper.contigID); + ++this->vcf_header_->getContig(*this->sort_order_helper.contigID); this->writer_.flush(); - // return false; - - this->writer_.WriteFinal(); + // Update container with this totempole entry + this->index += this->writer_.totempole_entry; + this->index.buildMetaIndex(this->vcf_header_->contigs.size()); + this->writer_.WriteFinal(this->index, this->footer_); if(this->writer_.GetVariantsWritten() == 0){ std::cerr << Helpers::timestamp("ERROR","IMPORT") << "Did not import any variants..." << std::endl; @@ -228,7 +152,6 @@ bool TomahawkImporter::BuildBCF(void){ << " variants to " << Helpers::NumberThousandsSeparator(std::to_string(this->writer_.blocksWritten())) << " blocks..." << std::endl; - return true; } @@ -237,37 +160,37 @@ bool TomahawkImporter::BuildVCF(void){ std::cerr << Helpers::timestamp("ERROR","VCF") << "Failed to open file..." 
<< std::endl; return false; } - this->header_ = new header_type; + this->vcf_header_ = new vcf_header_type; - if(!this->header_->parse(this->reader_)){ + if(!this->vcf_header_->parse(this->reader_)){ std::cerr << Helpers::timestamp("ERROR","VCF") << "Failed to parse VCF..." << std::endl; exit(1); } - if(!this->header_->good()){ - std::cerr << Helpers::timestamp("ERROR","VCF") << "Failed to parse VCF (" << this->header_->error_bit << ")..." << std::endl; + if(!this->vcf_header_->good()){ + std::cerr << Helpers::timestamp("ERROR","VCF") << "Failed to parse VCF (" << this->vcf_header_->error_bit << ")..." << std::endl; return false; } - if(this->header_->samples == 0){ + if(this->vcf_header_->samples == 0){ std::cerr << Helpers::timestamp("ERROR", "IMPORT") << "No samples detected..." << std::endl; return false; } - if(this->header_->samples == 1){ + if(this->vcf_header_->samples == 1){ std::cerr << Helpers::timestamp("ERROR", "IMPORT") << "Cannot run " << Tomahawk::Constants::PROGRAM_NAME << " with a single sample..." 
<< std::endl; return false; } // Spawn RLE controller - this->rle_controller = new rle_controller_type(this->header_->samples); + this->rle_controller = new rle_controller_type(this->vcf_header_->samples); this->rle_controller->DetermineBitWidth(); // Parse lines - line_type line(this->header_->size()); + vcf_entry_type line(this->vcf_header_->size()); this->reader_.clear(); - this->writer_.setHeader(*this->header_); - if(!this->writer_.Open(this->outputPrefix)) + this->writer_.setHeader(*this->vcf_header_); + if(!this->writer_.Open(this->output_prefix)) return false; if(!this->reader_.getLine()){ @@ -281,7 +204,7 @@ bool TomahawkImporter::BuildVCF(void){ } // Try to get contig information from header - if(!this->header_->getContig(std::string(line.CHROM, line.lCHROM), this->sort_order_helper.contigID)){ + if(!this->vcf_header_->getContig(std::string(line.CHROM, line.lCHROM), this->sort_order_helper.contigID)){ std::cerr << Helpers::timestamp("ERROR", "VCF") << "Contig does not exist in header..." << std::endl; return false; } @@ -308,11 +231,14 @@ bool TomahawkImporter::BuildVCF(void){ return false; } - ++this->header_->getContig(*this->sort_order_helper.contigID); + ++this->vcf_header_->getContig(*this->sort_order_helper.contigID); this->writer_.flush(); - // return false; + // Update container with this totempole entry + this->index += this->writer_.totempole_entry; - this->writer_.WriteFinal(); + // return false; + this->index.buildMetaIndex(this->vcf_header_->contigs.size()); + this->writer_.WriteFinal(this->index, this->footer_); if(this->writer_.GetVariantsWritten() == 0){ std::cerr << Helpers::timestamp("ERROR","IMPORT") << "Did not import any variants..." << std::endl; @@ -324,7 +250,7 @@ bool TomahawkImporter::BuildVCF(void){ << " variants to " << Helpers::NumberThousandsSeparator(std::to_string(this->writer_.blocksWritten())) << " blocks..." 
<< std::endl; - delete this->header_; + delete this->vcf_header_; return true; } @@ -332,39 +258,42 @@ bool TomahawkImporter::BuildVCF(void){ bool TomahawkImporter::parseBCFLine(bcf_entry_type& line){ if(this->sort_order_helper.prevcontigID != line.body->CHROM){ if(line.body->CHROM < this->sort_order_helper.prevcontigID){ - std::cerr << Helpers::timestamp("ERROR", "IMPORT") << "Contigs are not sorted (" << (*this->header_)[this->sort_order_helper.prevcontigID].name << " > " << (*this->header_)[line.body->CHROM].name << ")..." << std::endl; + std::cerr << Helpers::timestamp("ERROR", "IMPORT") << "Contigs are not sorted (" << (*this->vcf_header_)[this->sort_order_helper.prevcontigID].name << " > " << (*this->vcf_header_)[line.body->CHROM].name << ")..." << std::endl; exit(1); } if(!SILENT) - std::cerr << Helpers::timestamp("LOG", "IMPORT") << "Switch detected: " << this->header_->getContig(this->sort_order_helper.prevcontigID).name << "->" << this->header_->getContig(line.body->CHROM).name << "..." << std::endl; + std::cerr << Helpers::timestamp("LOG", "IMPORT") << "Switch detected: " << this->vcf_header_->getContig(this->sort_order_helper.prevcontigID).name << "->" << this->vcf_header_->getContig(line.body->CHROM).name << "..." << std::endl; this->sort_order_helper.previous_position = 0; // Get new contig value from header // and flush out data - ++this->header_->getContig(line.body->CHROM); + ++this->vcf_header_->getContig(line.body->CHROM); this->writer_.flush(); + // Update container with this totempole entry + this->index += this->writer_.totempole_entry; + // Update index values this->writer_.TotempoleSwitch(line.body->CHROM, 0); } // Assert position is in range - if(line.body->POS+1 > this->header_->getContig(line.body->CHROM).length){ - std::cerr << Helpers::timestamp("ERROR", "IMPORT") << (*this->header_)[line.body->CHROM].name << ':' << line.body->POS+1 << " > reported max size of contig (" << (*this->header_)[line.body->CHROM].length << ")..." 
<< std::endl; + if(line.body->POS+1 > this->vcf_header_->getContig(line.body->CHROM).length){ + std::cerr << Helpers::timestamp("ERROR", "IMPORT") << (*this->vcf_header_)[line.body->CHROM].name << ':' << line.body->POS+1 << " > reported max size of contig (" << (*this->vcf_header_)[line.body->CHROM].length << ")..." << std::endl; return false; } // Assert file is ordered if(line.body->POS < this->sort_order_helper.previous_position){ - std::cerr << Helpers::timestamp("ERROR", "IMPORT") << "File is not sorted by coordinates (" << (*this->header_)[line.body->CHROM].name << ':' << line.body->POS+1 << " > " << (*this->header_)[line.body->CHROM].name << ':' << this->sort_order_helper.previous_position << ")..." << std::endl; + std::cerr << Helpers::timestamp("ERROR", "IMPORT") << "File is not sorted by coordinates (" << (*this->vcf_header_)[line.body->CHROM].name << ':' << line.body->POS+1 << " > " << (*this->vcf_header_)[line.body->CHROM].name << ':' << this->sort_order_helper.previous_position << ")..." 
<< std::endl; return false; } // Assess missingness - const double missing = line.getMissingness(this->header_->samples); + const double missing = line.getMissingness(this->vcf_header_->samples); //const float missing = 0; if(line.body->POS == this->sort_order_helper.previous_position && line.body->CHROM == this->sort_order_helper.prevcontigID){ if(this->sort_order_helper.previous_included){ @@ -388,12 +317,14 @@ bool TomahawkImporter::parseBCFLine(bcf_entry_type& line){ goto next; } - // Flush if output block is over some size if(this->writer_.checkSize()){ - ++this->header_->getContig(line.body->CHROM); // update block count for this contigID + ++this->vcf_header_->getContig(line.body->CHROM); // update block count for this contigID this->writer_.flush(); + // Update container with this totempole entry + this->index += this->writer_.totempole_entry; + this->writer_.TotempoleSwitch(line.body->CHROM, this->sort_order_helper.previous_position); } if(this->writer_.add(line)) @@ -410,7 +341,7 @@ bool TomahawkImporter::parseBCFLine(bcf_entry_type& line){ return true; } -bool TomahawkImporter::parseVCFLine(line_type& line){ +bool TomahawkImporter::parseVCFLine(vcf_entry_type& line){ // Parse a VCF line if(!line.Parse(&this->reader_[0], this->reader_.size())){ std::cerr << Helpers::timestamp("ERROR", "VCF") << "Could not parse..." << std::endl; @@ -418,7 +349,7 @@ bool TomahawkImporter::parseVCFLine(line_type& line){ } // Try to get contig information from header - if(!this->header_->getContig(std::string(line.CHROM, line.lCHROM), this->sort_order_helper.contigID)){ + if(!this->vcf_header_->getContig(std::string(line.CHROM, line.lCHROM), this->sort_order_helper.contigID)){ std::cerr << Helpers::timestamp("ERROR", "VCF") << "Contig does not exist in header..." 
<< std::endl; return false; } @@ -426,40 +357,43 @@ bool TomahawkImporter::parseVCFLine(line_type& line){ // Switch in chromosome detected if(this->sort_order_helper.prevcontigID != *this->sort_order_helper.contigID){ if(*this->sort_order_helper.contigID < this->sort_order_helper.prevcontigID){ - std::cerr << Helpers::timestamp("ERROR", "VCF") << "Contigs are not sorted (" << (*this->header_)[this->sort_order_helper.prevcontigID].name << " > " << (*this->header_)[*this->sort_order_helper.contigID].name << ")..." << std::endl; + std::cerr << Helpers::timestamp("ERROR", "VCF") << "Contigs are not sorted (" << (*this->vcf_header_)[this->sort_order_helper.prevcontigID].name << " > " << (*this->vcf_header_)[*this->sort_order_helper.contigID].name << ")..." << std::endl; exit(1); } if(!SILENT) - std::cerr << Helpers::timestamp("LOG", "VCF") << "Switch detected: " << this->header_->getContig(this->sort_order_helper.prevcontigID).name << "->" << this->header_->getContig(*this->sort_order_helper.contigID).name << "..." << std::endl; + std::cerr << Helpers::timestamp("LOG", "VCF") << "Switch detected: " << this->vcf_header_->getContig(this->sort_order_helper.prevcontigID).name << "->" << this->vcf_header_->getContig(*this->sort_order_helper.contigID).name << "..." 
<< std::endl; this->sort_order_helper.previous_position = 0; // Get new contig value from header // and flush out data - ++this->header_->getContig(*this->sort_order_helper.contigID); + ++this->vcf_header_->getContig(*this->sort_order_helper.contigID); this->writer_.flush(); + // Update container with this totempole entry + this->index += this->writer_.totempole_entry; + // Update index values this->writer_.TotempoleSwitch(*this->sort_order_helper.contigID, 0); } // Assert position is in range - if(line.position > this->header_->getContig(*this->sort_order_helper.contigID).length){ - std::cerr << Helpers::timestamp("ERROR", "VCF") << (*this->header_)[*this->sort_order_helper.contigID].name << ':' << line.position << " > reported max size of contig (" << (*this->header_)[*this->sort_order_helper.contigID].length << ")..." << std::endl; + if(line.position > this->vcf_header_->getContig(*this->sort_order_helper.contigID).length){ + std::cerr << Helpers::timestamp("ERROR", "VCF") << (*this->vcf_header_)[*this->sort_order_helper.contigID].name << ':' << line.position << " > reported max size of contig (" << (*this->vcf_header_)[*this->sort_order_helper.contigID].length << ")..." << std::endl; return false; } // Assert file is ordered if(line.position < this->sort_order_helper.previous_position){ - std::cerr << Helpers::timestamp("ERROR", "VCF") << "File is not sorted by coordinates (" << (*this->header_)[*this->sort_order_helper.contigID].name << ':' << line.position << " > " << (*this->header_)[*this->sort_order_helper.contigID].name << ':' << this->sort_order_helper.previous_position << ")..." << std::endl; + std::cerr << Helpers::timestamp("ERROR", "VCF") << "File is not sorted by coordinates (" << (*this->vcf_header_)[*this->sort_order_helper.contigID].name << ':' << line.position << " > " << (*this->vcf_header_)[*this->sort_order_helper.contigID].name << ':' << this->sort_order_helper.previous_position << ")..." 
<< std::endl; return false; } // Execute only if the line is simple (biallelic and SNP) if(line.IsSimple()){ // Only check missing if simple - const double missing = line.getMissingness(this->header_->samples); + const double missing = line.getMissingness(this->vcf_header_->samples); if(line.position == this->sort_order_helper.previous_position && *this->sort_order_helper.contigID == this->sort_order_helper.prevcontigID){ if(this->sort_order_helper.previous_included){ //if(!SILENT) @@ -482,9 +416,12 @@ bool TomahawkImporter::parseVCFLine(line_type& line){ // Flush if output block is over some size if(this->writer_.checkSize()){ - ++this->header_->getContig(*this->sort_order_helper.contigID); // update block count for this contigID + ++this->vcf_header_->getContig(*this->sort_order_helper.contigID); // update block count for this contigID this->writer_.flush(); + // Update container with this totempole entry + this->index += this->writer_.totempole_entry; + this->writer_.TotempoleSwitch(*this->sort_order_helper.contigID, this->sort_order_helper.previous_position); } if(this->writer_.add(line)) diff --git a/src/tomahawk/TomahawkImporter.h b/src/tomahawk/TomahawkImporter.h index bf1cfda..9bce77b 100644 --- a/src/tomahawk/TomahawkImporter.h +++ b/src/tomahawk/TomahawkImporter.h @@ -4,81 +4,109 @@ #include "../io/reader.h" #include "../io/vcf/VCFHeader.h" #include "../io/bcf/BCFReader.h" -#include "TomahawkImporterFilters.h" -#include "TomahawkImportWriter.h" +#include "import_filters.h" +#include "import_writer.h" +#include "../index/index.h" namespace Tomahawk { +/**< + * This class handles importing `bcf`/`vcf` into the `twk` file format. 
+ */ class TomahawkImporter { - typedef TomahawkImporter self_type; - typedef reader reader_type; - typedef VCF::VCFHeader header_type; - typedef TomahawkImportWriter writer_type; - typedef VCF::VCFLine line_type; - typedef IO::BasicBuffer buffer_type; - typedef Algorithm::TomahawkImportRLE rle_controller_type; - typedef Totempole::TotempoleEntry totempole_entry_type; - typedef BCF::BCFReader bcf_reader_type; - typedef BCF::BCFEntry bcf_entry_type; - typedef TomahawkImporterFilters filter_type; + typedef TomahawkImporter self_type; + typedef reader reader_type; + typedef ImportWriter writer_type; + typedef ImporterFilters filter_type; + typedef IO::BasicBuffer buffer_type; + typedef Algorithm::GenotypeEncoder rle_controller_type; + typedef Totempole::IndexEntry totempole_entry_type; + typedef VCF::VCFHeader vcf_header_type; + typedef VCF::VCFLine vcf_entry_type; + typedef BCF::BCFReader bcf_reader_type; + typedef BCF::BCFEntry bcf_entry_type; + typedef Index index_type; + typedef Totempole::Footer footer_type; - /* - This supportive structure keeps track of the current and - previous contig identifiers and the previous obseved position. - This information is necessary to guarantee the sort-order of - the output Tomahawk file required for indexing. - The flag previous_included is triggered whenever an entry is - not filtered out. It is used when two or more entries share the - same position. In this case, if the preceding line was included - then ignore the current one. Otherwise, the preceding line was - filtered out and the include the current one. - Note that contigID is a pointer as this is required by our - hash-table implementation as a return value + /**< + * This supportive structure keeps track of the current and + * previous contig identifiers and the previous obseved position. + * This information is necessary to guarantee the sort-order of + * the output Tomahawk file required for indexing. 
+ * The flag previous_included is triggered whenever an entry is + * not filtered out. It is used when two or more entries share the + * same position. In this case, if the preceding line was included + * then ignore the current one. Otherwise, the preceding line was + * filtered out and the include the current one. + * Note that contigID is a pointer as this is required by our + * hash-table implementation as a return value */ - struct __InternalHelper{ + struct __InternalHelper { __InternalHelper(): contigID(nullptr), prevcontigID(-1), previous_position(-1), previous_included(false) {} - S32* contigID; // current contigID - S32 prevcontigID; // previous contigID - S32 previous_position; // current position + S32* contigID; // current contigID + S32 prevcontigID; // previous contigID + S32 previous_position; // current position bool previous_included; } sort_order_helper; public: - TomahawkImporter(std::string inputFile, std::string outputPrefix); + TomahawkImporter(const std::string inputFile, const std::string outputPrefix); ~TomahawkImporter(); + + /**< + * Primary import function for data. The function internally checks + * the target input file type (`bcf`/`vcf`). + * @return Returns TRUE upon success or FALSE otherwise + */ bool Build(); + + /**< + * Extends an existing `twk` file with a target input file. + * Warning: this function does NOT check if the file headers + * are in order! 
If they are not in order the `twk` file will + * be corrupted + * @param extendFile Target `twk` file to extend + * @return Returns TRUE upon success or FALSE otherwise + */ bool Extend(std::string extendFile); filter_type& getFilters(void){ return(this->filters); } private: + // Basic import funtionality bool BuildVCF(); // import a VCF file bool BuildBCF(); // import a BCF file + + // Extend existing `twk` file with data from a `vcf`/`bcf` file bool ExtendVCF(); // extend a Twk file with a VCF file bool ExtendBCF(); // extend a Twk file with a BCF file - bool parseVCFLine(line_type& line); // Import a VCF line + // Parse a `bcf`/`vcf` line + bool parseVCFLine(vcf_entry_type& line); // Import a VCF line bool parseBCFLine(bcf_entry_type& line); // Import a BCF line + // Check if the current meta and RLE buffers exceeds // the disk flush limit - bool checkSize(void) const{ return(this->meta_buffer.size() + this->rle_buffer.size() >= this->block_flush_limit); } + inline bool checkSize(void) const{ return(this->meta_buffer.size() + this->rle_buffer.size() >= this->block_flush_limit); } private: - U32 block_flush_limit; // limit in bytes when to flush to disk - std::string inputFile; // input file name - std::string outputPrefix; // output file prefix - reader_type reader_; // reader - writer_type writer_; // writer - buffer_type meta_buffer; // meta buffer - buffer_type rle_buffer; // RLE buffer - totempole_entry_type totempole_entry; // totempole entry for indexing - filter_type filters; - header_type* header_; // header - rle_controller_type* rle_controller; // RLE packer + U32 block_flush_limit;// limit in bytes when to flush to disk + std::string input_file; // input file name + std::string output_prefix; // output file prefix + reader_type reader_; // reader + writer_type writer_; // writer + buffer_type meta_buffer; // meta buffer + buffer_type rle_buffer; // RLE buffer + totempole_entry_type totempole_entry; // current (active) index entry + filter_type 
filters; // filters + index_type index; + footer_type footer_; + vcf_header_type* vcf_header_; // vcf header + rle_controller_type* rle_controller; // RLE packer algorithms }; diff --git a/src/tomahawk/TomahawkOutput/TomahawkOutputEntry.h b/src/tomahawk/TomahawkOutput/TomahawkOutputEntry.h deleted file mode 100644 index 7baa506..0000000 --- a/src/tomahawk/TomahawkOutput/TomahawkOutputEntry.h +++ /dev/null @@ -1,145 +0,0 @@ -#ifndef TOMAHAWKOUTPUTENTRY_H_ -#define TOMAHAWKOUTPUTENTRY_H_ - -#include "../../io/BasicBuffer.h" -#include "../../totempole/TotempoleContig.h" - -namespace Tomahawk{ -namespace IO{ - -#pragma pack(1) -struct TomahawkOutputEntry{ - typedef TomahawkOutputEntry self_type; - typedef Totempole::TotempoleContigBase contig_type; - - TomahawkOutputEntry(){}; - ~TomahawkOutputEntry(){}; - TomahawkOutputEntry(const self_type* const other){ - memcpy(this, other, sizeof(self_type)); - } - - // Comparator function - // Called from sort helper only - bool operator<(const self_type& other) const{ - if (this->AcontigID < other.AcontigID) return true; - if (other.AcontigID < this->AcontigID) return false; - - if (this->Aposition < other.Aposition) return true; - if (other.Aposition < this->Aposition) return false; - - if (this->BcontigID < other.BcontigID) return true; - if (other.BcontigID < this->BcontigID) return false; - - if (this->Bposition < other.Bposition) return true; - if (other.Bposition < this->Bposition) return false; - - return false; - } - - // Comparator function: inverse of lesser comparator - bool operator>(const self_type& other){ return(!((*this) < other)); } - - friend std::ostream& operator<<(std::ostream& os, const self_type& entry){ - os << std::setprecision(8) << (int)entry.FLAGS << '\t' << entry.AcontigID << '\t' << entry.Aposition << '\t' << entry.BcontigID << '\t' << entry.Bposition - << '\t' << entry.p1 << '\t' << entry.p2 << '\t' << entry.q1 << '\t' << entry.q2 << '\t' << entry.D << '\t' << entry.Dprime - << '\t' << entry.R2 << 
'\t' << entry.P << '\t' << entry.chiSqFisher << '\t' << entry.chiSqModel; - - return(os); - } - - std::ostream& write(std::ostream& os, const contig_type* const contigs) const{ - os << std::setprecision(8) << (int)this->FLAGS << '\t' << contigs[this->AcontigID].name << '\t' << this->Aposition << '\t' << contigs[this->BcontigID].name << '\t' << this->Bposition - << '\t' << this->p1 << '\t' << this->p2 << '\t' << this->q1 << '\t' << this->q2 << '\t' << this->D << '\t' << this->Dprime - << '\t' << this->R2 << '\t' << this->P << '\t' << this->chiSqFisher << '\t' << this->chiSqModel << '\n'; - - return(os); - } - - friend IO::BasicBuffer& operator<<(IO::BasicBuffer& b, const self_type& entry){ - b.Add(reinterpret_cast(&entry), sizeof(self_type)); - return(b); - } - - // Swaps cA,pA with cB,pB - // used in sorting for indices - void swapDirection(void){ - std::swap(this->AcontigID, this->BcontigID); - U32& A = *reinterpret_cast(((char*)this + sizeof(U16) + sizeof(U32))); - U32& B = *reinterpret_cast(((char*)this + sizeof(U16) + 3*sizeof(U32))); - std::swap(A,B); - } - - U16 FLAGS; - U32 AcontigID; - U32 Amissing: 1, Aphased: 1, Aposition: 30; - U32 BcontigID; - U32 Bmissing: 1, Bphased: 1, Bposition: 30; - float p1, p2, q1, q2; - float D, Dprime; - float R2; - double P; - double chiSqFisher; - double chiSqModel; -}; - - -// Sort reinterpreted casts of data -// Workaround is based on data being reinterpreted as -// entries from byte streams. Therefore regular sorting -// is illegal. 
Instead a BYTE array literals stored as -// a hard copy and reinterpreted as an entry in the -// overloaded operator< -#pragma pack(1) -struct TomahawkOutputEntrySort{ - typedef TomahawkOutputEntrySort self_type; - typedef TomahawkOutputEntry parent_type; - - TomahawkOutputEntrySort(){} - TomahawkOutputEntrySort(const self_type& other){ - memcpy(this->data, other.data, sizeof(parent_type)); - } - TomahawkOutputEntrySort(self_type&& other) noexcept{ std::swap(this->data, other.data); } - TomahawkOutputEntrySort& operator=(const self_type& other){ - self_type tmp(other); // re-use copy-constructor - *this = std::move(tmp); // re-use move-assignment - return *this; - } - TomahawkOutputEntrySort& operator=(self_type&& other) noexcept{ - std::swap(this->data, other.data); - return *this; - } - ~TomahawkOutputEntrySort(){ } - - bool operator<(const self_type& other) const{ - const parent_type& self_parent = *reinterpret_cast(&this->data[0]); - const parent_type& other_parent = *reinterpret_cast(&other.data[0]); - return(self_parent < other_parent); - } - - BYTE data[sizeof(parent_type)]; // reinterpret me as entry -}; - -// comparator functions for output entry -namespace Support{ - -static inline bool TomahawkOutputEntryCompFuncConst(const TomahawkOutputEntry& self, const TomahawkOutputEntry& other){ - if (self.AcontigID < other.AcontigID) return true; - if (other.AcontigID < self.AcontigID) return false; - - if (self.Aposition < other.Aposition) return true; - if (other.Aposition < self.Aposition) return false; - - if (self.BcontigID < other.BcontigID) return true; - if (other.BcontigID < self.BcontigID) return false; - - if (self.Bposition < other.Bposition) return true; - if (other.Bposition < self.Bposition) return false; - - return false; -} - -} -} -} - -#endif /* TOMAHAWKOUTPUTENTRY_H_ */ diff --git a/src/tomahawk/TomahawkOutput/TomahawkOutputManager.h b/src/tomahawk/TomahawkOutput/TomahawkOutputManager.h deleted file mode 100644 index d736156..0000000 --- 
a/src/tomahawk/TomahawkOutput/TomahawkOutputManager.h +++ /dev/null @@ -1,237 +0,0 @@ -#ifndef TOMAHAWK_TOMAHAWKOUTPUTMANAGER_H_ -#define TOMAHAWK_TOMAHAWKOUTPUTMANAGER_H_ - -#include "../../io/BasicWriters.h" -#include "../../io/compression/TGZFController.h" -#include "../../support/MagicConstants.h" -#include "../../totempole/TotempoleContig.h" -#include "../../totempole/TotempoleMagic.h" -#include "../../totempole/TotempoleOutputEntry.h" -#include "../TomahawkBlockManager.h" -#include "TomahawkOutputEntry.h" -#include "TomahawkOutputLD.h" - -#define SLAVE_FLUSH_LIMIT 10000000 // 10 MB - -namespace Tomahawk{ -namespace IO { - -template -struct TomahawkOutputManager{ - typedef TomahawkOutputManager self_type; - typedef IO::WriterFile writer_type; - typedef TomahawkBlock controller_type; - typedef Tomahawk::Support::TomahawkOutputLD helper_type; - typedef IO::BasicBuffer buffer_type; - typedef TGZFController tgzf_controller; - typedef IO::TomahawkOutputEntry entry_type; - typedef Totempole::TotempoleOutputEntry totempoly_entry; - typedef Totempole::TotempoleOutputEntryController totempole_controller_byte; - -public: - TomahawkOutputManager() : - outCount(0), - progressCount(0), - totempole_blocks_written(0), - writer(nullptr), - writer_index(nullptr), - buffer(2*SLAVE_FLUSH_LIMIT), - sprintf_buffer(new char[255]) - { - - } - - ~TomahawkOutputManager(){ - this->flushBlock(); - this->buffer.deleteAll(); - delete [] this->sprintf_buffer; - } - - TomahawkOutputManager(const self_type& other) : - outCount(0), - progressCount(0), - totempole_blocks_written(0), - writer(other.writer), - writer_index(other.writer_index), - buffer(2*SLAVE_FLUSH_LIMIT), - sprintf_buffer(new char[255]) - { - } - - self_type& operator+=(const self_type& other){ - this->outCount += other.outCount; - this->totempole_blocks_written += other.totempole_blocks_written; - return(*this); - } - - self_type& operator=(const U32 totempole_blocks){ - this->totempole_blocks_written = totempole_blocks; - 
return(*this); - } - - inline const U64& GetCounts(void) const{ return this->outCount; } - inline void ResetProgress(void){ this->progressCount = 0; } - inline const U32& GetProgressCounts(void) const{ return this->progressCount; } - inline const U32& getTotempoleBlocks(void) const{ return(this->totempole_blocks_written); } - - bool Open(const std::string output, Totempole::TotempoleReader& totempole){ - if(output.size() == 0) - return false; - - this->writer = new writer_type; - this->writer_index = new writer_type; - - this->CheckOutputNames(output); - this->filename = output; - if(!this->writer->open(this->basePath + this->baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX)){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to open..." << std::endl; - return false; - } - - if(!this->writer_index->open(this->basePath + this->baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX + '.' + Tomahawk::Constants::OUTPUT_LD_SORT_INDEX_SUFFIX)){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed open index..." << std::endl; - return false; - } - - if(!this->WriteHeader(totempole)){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to write header" << std::endl; - return false; - } - - return true; - } - - void flushBlock(void){ - if(this->buffer.size() > 0){ - if(!this->compressor.Deflate(this->buffer)){ - std::cerr << Helpers::timestamp("ERROR","TGZF") << "Failed deflate DATA..." 
<< std::endl; - exit(1); - } - - this->writer->getLock()->lock(); - this->entry.byte_offset = (U64)this->writer->getNativeStream().tellp(); - this->entry.uncompressed_size = this->buffer.size(); - this->writer->writeNoLock(this->compressor.buffer); - this->entry.byte_offset_end = (U64)this->writer->getNativeStream().tellp(); - this->writer_index->getNativeStream() << this->entry; - ++this->totempole_blocks_written; - //std::cerr << this->entry << std::endl; - this->writer->getLock()->unlock(); - - this->buffer.reset(); - this->compressor.Clear(); - this->entry.reset(); - } - } - - bool finalise(void){ - // Make sure data is flushed - this->writer->flush(); - this->writer_index->flush(); - - // Update blocks written - std::fstream re(this->basePath + this->baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX + '.' + Tomahawk::Constants::OUTPUT_LD_SORT_INDEX_SUFFIX, std::ios::in | std::ios::out | std::ios::binary); - if(!re.good()){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to reopen index..." << std::endl; - return false; - } - - re.seekg(Tomahawk::Constants::WRITE_HEADER_LD_SORT_MAGIC_LENGTH + sizeof(float) + sizeof(U64) + sizeof(U32)); - if(!re.good()){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to seek in index..." << std::endl; - return false; - } - - re.write((char*)&this->totempole_blocks_written, sizeof(U32)); - if(!re.good()){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to update counts in index..." 
<< std::endl; - return false; - } - re.flush(); - re.close(); - - return true; - } - - void Add(const controller_type& a, const controller_type& b, const helper_type& helper){ - const U32 writePosA = a.meta[a.metaPointer].position << 2 | a.meta[a.metaPointer].phased << 1 | a.meta[a.metaPointer].missing; - const U32 writePosB = b.meta[b.metaPointer].position << 2 | b.meta[b.metaPointer].phased << 1 | b.meta[b.metaPointer].missing; - this->buffer += helper.controller; - this->buffer += a.support->contigID; - this->buffer += writePosA; - this->buffer += b.support->contigID; - this->buffer += writePosB; - this->buffer << helper; - ++this->outCount; - ++this->progressCount; - ++this->entry.entries; - - if(this->buffer.size() > SLAVE_FLUSH_LIMIT) - this->flushBlock(); - } - - void close(void){ - this->writer->flush(); - this->writer_index->flush(); - this->writer->close(); - this->writer_index->close(); - delete this->writer; - delete this->writer_index; - } - -private: - bool WriteHeader(Totempole::TotempoleReader& totempole){ - //typedef TomahawkOutputHeader header_type; - std::ofstream& stream = this->writer->getNativeStream(); - std::ofstream& stream_index = this->writer_index->getNativeStream(); - - TomahawkOutputHeader head(Tomahawk::Constants::WRITE_HEADER_LD_MAGIC, totempole.getSamples(), totempole.getContigs()); - TomahawkOutputSortHeader headIndex(Tomahawk::Constants::WRITE_HEADER_LD_SORT_MAGIC, totempole.getSamples(), totempole.getContigs()); - stream << head; - stream_index << headIndex; - - // Write contig data to TWO - // length | n_char | chars[0 .. n_char - 1] - for(U32 i = 0; i < totempole.getContigs(); ++i) - stream << *totempole.getContigBase(i); - - if(!totempole.writeLiterals(stream)){ - std::cerr << Helpers::timestamp("ERROR", "TGZF") << "Failed to write literals..." 
<< std::endl; - return false; - } - - return(stream.good()); - } - - void CheckOutputNames(const std::string& input){ - std::vector paths = Helpers::filePathBaseExtension(input); - this->basePath = paths[0]; - if(this->basePath.size() > 0) - this->basePath += '/'; - - if(paths[3].size() == Tomahawk::Constants::OUTPUT_LD_SUFFIX.size() && strncasecmp(&paths[3][0], &Tomahawk::Constants::OUTPUT_LD_SUFFIX[0], Tomahawk::Constants::OUTPUT_LD_SUFFIX.size()) == 0) - this->baseName = paths[2]; - else this->baseName = paths[1]; - } - - -private: - std::string filename; - std::string basePath; - std::string baseName; - - U64 outCount; // lines written - U32 progressCount; // lines added since last flush - U32 totempole_blocks_written; - totempoly_entry entry; // track stuff - writer_type* writer; // writer - writer_type* writer_index; // writer index - buffer_type buffer; // internal buffer - tgzf_controller compressor;// compressor - char* sprintf_buffer; // special buffer used for sprintf writing scientific output in natural mode -}; - -} -} - -#endif /* TOMAHAWK_TOMAHAWKOUTPUTMANAGER_H_ */ diff --git a/src/tomahawk/TomahawkOutput/TomahawkOutputReader.h b/src/tomahawk/TomahawkOutput/TomahawkOutputReader.h deleted file mode 100644 index a297329..0000000 --- a/src/tomahawk/TomahawkOutput/TomahawkOutputReader.h +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef TOMAHAWKOUTPUTREADER_H_ -#define TOMAHAWKOUTPUTREADER_H_ - -#include -#include -#include -#include -#include - -#include "../../support/TypeDefinitions.h" -#include "../../io/BasicBuffer.h" -#include "../../io/compression/TGZFController.h" -#include "../../support/MagicConstants.h" -#include "../../algorithm/OpenHashTable.h" -#include "../../totempole/TotempoleMagic.h" -#include "../../third_party/intervalTree.h" -#include "../../totempole/TotempoleOutputReader.h" -#include "TomahawkOutputEntry.h" -#include "TomahawkOutputFilterController.h" -#include "TomahawkOutputWriter.h" - -namespace Tomahawk { -namespace IO { - -class 
TomahawkOutputReader { - typedef TomahawkOutputEntry entry_type; - typedef TomahawkOutputFilterController filter_type; - typedef Tomahawk::IO::TomahawkOutputWriterInterface writer_type; - typedef TomahawkOutputHeader header_type; - typedef Totempole::TotempoleContigBase contig_type; - typedef TGZFHeader tgzf_type; - typedef Hash::HashTable hash_table; - typedef IO::TGZFController tgzf_controller_type; - typedef Tomahawk::Algorithm::ContigInterval interval_type; - typedef Tomahawk::Algorithm::IntervalTree tree_type; - typedef Totempole::TotempoleOutputSortedEntry totempole_sorted_entry_type; - typedef IO::TomahawkOutputWriterIndex twoi_writer_type; - typedef Tomahawk::IO::TomahawkOutputSortHeader toi_header_type; - - -public: - typedef Totempole::TotempoleOutputReader toi_reader_type; - enum WRITER_TYPE {binary, natural}; - -public: - TomahawkOutputReader(); - ~TomahawkOutputReader(); - - const entry_type* operator[](const U32 p) const{ return(reinterpret_cast(&this->output_buffer.data[sizeof(entry_type)*p])); } - - // Streaming functions - bool getBlock(const U32 blockID); - bool getBlock(std::vector< std::pair >& pairs); - bool AddRegions(std::vector& positions); - bool Open(const std::string input); - bool OpenExtend(const std::string input); - bool nextBlock(const bool clear = true); - bool nextVariant(const entry_type*& entry); - bool nextVariantLimited(const entry_type*& entry); - bool nextBlockUntil(const U32 limit); - bool nextBlockUntil(const U32 limit, const U64 virtual_offset); - inline void addLiteral(const std::string& string){ this->literals += string; } - - // Other - bool view(const std::string& filename); - bool index(const std::string& filename); - bool summary(const std::string& input, const U32 bins); - - // Concatenate - bool concat(const std::string& file_list, const std::string& output); - bool concat(const std::vector& files, const std::string& output); - - // - bool setWriterType(const int type); - void setWriteHeader(const bool write){ 
this->output_header = write; } - - filter_type& getFilter(void){ return this->filter; } - bool OpenWriter(void); - bool OpenWriter(const std::string output_file); - -private: - bool __Open(const std::string input); - bool ParseHeader(void); - bool ParseHeaderExtend(void); - bool __ParseRegion(const std::string& region, interval_type& interval); - bool __ParseRegionIndexed(const std::string& region, interval_type& interval); - bool __ParseRegionIndexedBlocks(void); - bool __viewOnly(void); - bool __viewFilter(void); - bool __viewRegion(void); - bool __viewRegionIndexed(void); - bool __checkRegionIndex(const entry_type* const entry); - bool __checkRegionNoIndex(const entry_type* const entry); - bool __concat(const std::vector& files, const std::string& output); - - bool AddRegionsIndexed(std::vector& positions); - bool AddRegionsUnindexed(std::vector& positions); - -public: - U64 filesize; // input file size - U64 position; - U64 size; - bool hasIndex; - std::ifstream stream; // reader stream - header_type header; // header - bool output_header; - IO::BasicBuffer buffer; // internal buffer - IO::BasicBuffer output_buffer; // internal buffer - tgzf_controller_type gzip_controller; // TGZF controller - filter_type filter; // filter parameters - WRITER_TYPE writer_output_type; - std::string literals; // header literals - writer_type* writer; // writer interface - contig_type* contigs; - hash_table* contig_htable; // map input string to internal contigID - tree_type** interval_tree; - std::vector* interval_tree_entries; - std::vector* interval_totempole_enties; - toi_reader_type toi_reader; -}; - -} -} /* namespace Tomahawk */ - -#endif /* TOMAHAWKOUTPUTREADER_H_ */ diff --git a/src/tomahawk/TomahawkOutput/TomahawkOutputWriter.h b/src/tomahawk/TomahawkOutput/TomahawkOutputWriter.h deleted file mode 100644 index b0a389d..0000000 --- a/src/tomahawk/TomahawkOutput/TomahawkOutputWriter.h +++ /dev/null @@ -1,433 +0,0 @@ -#ifndef TOMAHAWKOUTPUTWRITER_H_ -#define 
TOMAHAWKOUTPUTWRITER_H_ - -#include "../../io/BasicWriters.h" -#include "../../totempole/TotempoleMagic.h" -#include "../../totempole/TotempoleOutputEntry.h" - -namespace Tomahawk{ -namespace IO { - -class TomahawkOutputWriterInterface { -private: - typedef TomahawkOutputWriterInterface self_type; - -protected: - typedef GenericWriterInterace stream_type; - typedef IO::WriterStandardOut cout_type; - typedef IO::WriterFile file_type; - typedef IO::BasicBuffer buffer_type; - -protected: - typedef TomahawkOutputEntry entry_type; - typedef TomahawkOutputHeader header_type; - typedef Totempole::TotempoleContigBase contig_type; - -public: - typedef Algorithm::SpinLock lock_type; - -public: - TomahawkOutputWriterInterface(const contig_type* contigs, const header_type* header) : - stream(nullptr), - header(header), - contigs(contigs) - {} - virtual ~TomahawkOutputWriterInterface(){ delete this->stream; } - - virtual bool open(void){ - this->stream = new cout_type(); - if(!this->stream->open()) - return false; - - return true; - } - - virtual bool open(const std::string output){ - if(output.size() == 0) - return(this->open()); - - this->outFile = output; - this->CheckOutputNames(output); - - this->stream = new file_type(); - if(!this->stream->open(this->basePath + this->baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX)){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to open..." 
<< std::endl; - return false; - } - - return true; - } - - virtual inline void flush(void){ this->stream->flush(); } - virtual inline bool close(void){ this->stream->close(); return true; } - virtual void operator<<(const entry_type* const entryentry) =0; - virtual void operator<<(const entry_type& entryentry) =0; - virtual inline void write(const char* data, const U32 length){ this->stream->write(&data[0], length); } - virtual inline void writeNoLock(const char* data, const U32 length){ this->stream->writeNoLock(&data[0], length); } - virtual inline const size_t write(buffer_type& buffer){ return(this->stream->write(&buffer.data[0], buffer.size())); } - virtual inline const size_t writeNoLock(buffer_type& buffer){ return(this->stream->writeNoLock(&buffer.data[0], buffer.size())); } - inline stream_type* getStream(void){ return(this->stream); } - inline lock_type* getLock(void){ return(this->stream->getLock()); } - - template - void write(const T& data){ - *reinterpret_cast(&this->stream->getStream()) << data; - } - - virtual void writeHeader(std::string& literals) =0; - virtual void writeEOF(void) =0; - - void CheckOutputNames(const std::string& input){ - std::vector paths = Helpers::filePathBaseExtension(input); - this->basePath = paths[0]; - if(this->basePath.size() > 0) - this->basePath += '/'; - - if(paths[3].size() == Tomahawk::Constants::OUTPUT_LD_SUFFIX.size() && strncasecmp(&paths[3][0], &Tomahawk::Constants::OUTPUT_LD_SUFFIX[0], Tomahawk::Constants::OUTPUT_LD_SUFFIX.size()) == 0) - this->baseName = paths[2]; - else this->baseName = paths[1]; - } - -protected: - std::string outFile; - std::string basePath; - std::string baseName; - - stream_type* stream; - const header_type* header; - const contig_type* contigs; -}; - -class TotempoleOutputIndexWriter : public TomahawkOutputWriterInterface{ - typedef TotempoleOutputIndexWriter self_type; - typedef TomahawkOutputWriterInterface parent_type; - typedef Totempole::TotempoleOutputEntry totempole_type; - typedef 
Tomahawk::IO::TomahawkOutputSortHeader toi_header_type; - typedef Totempole::TotempoleOutputSortedIndex index_type; - -public: - TotempoleOutputIndexWriter(); - ~TotempoleOutputIndexWriter(); - - bool Open(const std::string& input); - - // Add entire block - void update(const entry_type* entry, const U32 length, totempole_type& totempole){ - const entry_type* prev = entry; - for(U32 i = 0; i < length; ++i, ++entry){ - if(prev->AcontigID < entry->AcontigID || prev->Aposition <= entry->Aposition){ - std::cerr << Helpers::timestamp("ERROR", "TOI") << "File is not sorted" << std::endl; - exit(1); - } - this->index.update(*entry, this->current_blockID, i); - std::swap(prev, entry); - } - - // flush block - file_type& stream = *reinterpret_cast(this->stream); - stream.getNativeStream() << totempole; - } - - void writeHeader(std::string& literals){ - //this->stream_index.getNativeStream() << toi_header; - }; - -private: - U32 current_blockID; - toi_header_type toi_header; - index_type index; -}; - -// case binary -class TomahawkOutputWriter : public TomahawkOutputWriterInterface { -private: - typedef TomahawkOutputWriter self_type; - -protected: - typedef IO::BasicBuffer buffer_type; - typedef IO::TGZFController tgzf_controller_type; - -public: - TomahawkOutputWriter(const contig_type* contigs, const header_type* header) : TomahawkOutputWriterInterface(contigs, header), flush_limit(524288){} - TomahawkOutputWriter(const contig_type* contigs, const header_type* header, const U32 flush_limit) : - TomahawkOutputWriterInterface(contigs, header), - flush_limit(flush_limit), - buffer(flush_limit + 2048) - { - this->controller.buffer.resize(this->buffer); - } - - // if binary - // does not support cout writer - // also open two.twi - // if not ending in .two add .two - - virtual ~TomahawkOutputWriter(){ - // Flush upon termination - this->flush(); - this->writeEOF(); - this->close(); - this->buffer.deleteAll(); - } - - inline void flush(void){ - if(this->buffer.size() > 0){ - 
this->controller.Deflate(this->buffer); - this->stream->write(&this->controller.buffer[0], this->controller.buffer.size()); - this->controller.Clear(); - this->buffer.reset(); - } - this->stream->flush(); - } - virtual inline bool close(void){ this->stream->close(); return true; } - - virtual void operator<<(const entry_type* const entry){ - this->buffer.Add(reinterpret_cast(entry), sizeof(entry_type)); - if(this->buffer.size() > this->flush_limit){ - this->controller.Deflate(this->buffer); - this->stream->write(&this->controller.buffer[0], this->controller.buffer.size()); - this->controller.Clear(); - this->buffer.reset(); - } - } - - virtual void operator<<(const entry_type& entry){ - this->buffer.Add(reinterpret_cast(&entry), sizeof(entry_type)); - if(this->buffer.size() > this->flush_limit){ - this->controller.Deflate(this->buffer); - this->stream->write(&this->controller.buffer[0], this->controller.buffer.size()); - this->controller.Clear(); - this->buffer.reset(); - } - } - - virtual inline const size_t write(buffer_type& buffer){ - this->controller.Clear(); - this->controller.Deflate(buffer); - this->stream->write(&this->controller.buffer[0], this->controller.buffer.size()); - return(this->controller.buffer.size()); - } - - virtual inline const size_t writeNoLock(buffer_type& buffer){ - this->controller.Clear(); - this->controller.Deflate(buffer); - this->stream->writeNoLock(&this->controller.buffer[0], this->controller.buffer.size()); - return(this->controller.buffer.size()); - } - - void writeHeader(std::string& literals){ - std::ofstream& __stream = *reinterpret_cast(&this->stream->getStream()); - __stream << *this->header; - for(U32 i = 0; i < this->header->n_contig; ++i) - __stream << this->contigs[i]; - - buffer_type bufferInternal(&literals[0], literals.size()); - if(!this->controller.Deflate(bufferInternal)){ - std::cerr << Helpers::timestamp("ERROR", "TGZF") << "Failed to deflate!" 
<< std::endl; - return; - } - - __stream.write(&this->controller.buffer.data[0], this->controller.buffer.pointer); - this->controller.Clear(); - }; - - U64 getUncompressedSize(void) const{ return(this->controller.buffer.size()); } - - // There is no EOF - void writeEOF(void){}; - -protected: - U32 flush_limit; - buffer_type buffer; - tgzf_controller_type controller; -}; - -// case binary + index -class TomahawkOutputWriterIndex : public TomahawkOutputWriter{ - typedef TomahawkOutputWriter parent_type; - typedef Totempole::TotempoleOutputEntry totempole_type; - typedef Tomahawk::IO::TomahawkOutputSortHeader toi_header_type; - typedef Totempole::TotempoleOutputSortedIndex index_type; - -public: - TomahawkOutputWriterIndex(const contig_type* contigs, const header_type* header, const toi_header_type& toi_header) : - TomahawkOutputWriter(contigs, header), - current_blockID(0), - toi_header(toi_header), - index(header->n_contig, contigs) - { - - } - - TomahawkOutputWriterIndex(const contig_type* contigs, const header_type* header) : - TomahawkOutputWriter(contigs, header), - current_blockID(0), - index(header->n_contig, contigs) - { - this->toi_header = toi_header_type(Tomahawk::Constants::WRITE_HEADER_LD_SORT_MAGIC, header->samples, header->n_contig); - } - - TomahawkOutputWriterIndex(const contig_type* contigs, const header_type* header, const U32 flush_limit) : - TomahawkOutputWriter(contigs, header, flush_limit), - current_blockID(0), - index(header->n_contig, contigs) - { - this->toi_header = toi_header_type(Tomahawk::Constants::WRITE_HEADER_LD_SORT_MAGIC, header->samples, header->n_contig); - } - - ~TomahawkOutputWriterIndex(){} - - bool open(const std::string output){ - if(output.size() == 0){ - std::cerr << Helpers::timestamp("ERROR", "WRITER") << "Writing to cout is illegal..." 
<< std::endl; - return false; - } - - this->outFile = output; - this->CheckOutputNames(output); - - this->stream = new file_type(); - if(!this->stream->open(this->basePath + this->baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX)){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to open..." << std::endl; - return false; - } - - if(!this->stream_index.open(this->basePath + this->baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX + '.' + Tomahawk::Constants::OUTPUT_LD_SORT_INDEX_SUFFIX)){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed open index..." << std::endl; - return false; - } - - return true; - } - - inline bool close(void){ - this->stream->flush(); - this->stream->close(); - this->stream_index.flush(); - this->stream_index.close(); - return true; - } - - inline void flush(void){ - this->flushBlock(); - this->stream_index.flush(); - this->stream->flush(); - } - - void operator<<(const entry_type* const entry){ - ++this->totempole_entry.entries; - - this->index.update(*entry, this->current_blockID, this->buffer.size() / sizeof(entry_type)); - this->buffer.Add(reinterpret_cast(&entry), sizeof(entry_type)); - - if(this->buffer.size() > this->flush_limit) - this->flushBlock(); - } - - void operator<<(const entry_type& entry){ - ++this->totempole_entry.entries; - - this->index.update(entry, this->current_blockID, this->buffer.size() / sizeof(entry_type)); - this->buffer.Add(reinterpret_cast(&entry), sizeof(entry_type)); - - if(this->buffer.size() > this->flush_limit) - this->flushBlock(); - } - - void writeHeader(std::string& literals){ - parent_type::writeHeader(literals); - this->stream_index.getNativeStream() << toi_header; - }; - - void flushBlock(void){ - if(this->buffer.size() == 0) - return; - - file_type& stream = *reinterpret_cast(this->stream); - this->controller.Deflate(this->buffer); - this->totempole_entry.byte_offset = stream.getNativeStream().tellp(); - this->stream->write(&this->controller.buffer[0], 
this->controller.buffer.size()); - this->totempole_entry.byte_offset_end = stream.getNativeStream().tellp(); - this->totempole_entry.uncompressed_size = this->controller.buffer.size(); - this->controller.Clear(); - this->buffer.reset(); - this->stream_index.getNativeStream() << this->totempole_entry; - this->totempole_entry.reset(); - ++this->current_blockID; - } - - index_type& getIndex(void){ return(this->index); } - - bool finalize(bool output_index){ - // Totempole Output Index - // Update blocks written - std::fstream re(this->basePath + this->baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX + '.' + Tomahawk::Constants::OUTPUT_LD_SORT_INDEX_SUFFIX, std::ios::in | std::ios::out | std::ios::binary); - if(!re.good()){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to reopen index..." << std::endl; - return false; - } - re.seekg(Tomahawk::Constants::WRITE_HEADER_LD_SORT_MAGIC_LENGTH + sizeof(float) + sizeof(U64) + sizeof(U32)); - if(!re.good()){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to seek in index..." << std::endl; - return false; - } - - re.write((char*)¤t_blockID, sizeof(U32)); - if(!re.good()){ - std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to update counts in index..." 
<< std::endl; - return false; - } - re.close(); - - // Write sorted index - if(output_index) - this->stream_index.getNativeStream() << this->index; - - return true; - } - -private: - U32 current_blockID; - toi_header_type toi_header; - totempole_type totempole_entry; - file_type stream_index; - index_type index; -}; - -// case natural -class TomahawkOutputWriterNatural : public TomahawkOutputWriterInterface { - typedef TomahawkOutputWriterNatural self_type; - -public: - TomahawkOutputWriterNatural(const contig_type* contigs, const header_type* header) : TomahawkOutputWriterInterface(contigs, header){} - ~TomahawkOutputWriterNatural(){ - // Flush upon termination - this->flush(); - this->writeEOF(); - this->close(); - } - - void operator<<(const entry_type* const entry){ - entry->write(*reinterpret_cast(&this->stream->getStream()), this->contigs); - } - - void operator<<(const entry_type& entry){ - entry.write(*reinterpret_cast(&this->stream->getStream()), this->contigs); - } - - void writeHeader(std::string& literals){ - const std::string header = "FLAG\tAcontigID\tAposition\tBcontigID\tBpositionID\tp1\tp2\tq1\tq2\tD\tDprime\tR2\tP\tchiSqFisher\tchiSqModel\n"; - if(literals.size() > 0) - this->stream->getStream() << literals << '\n' << header; - else - this->stream->getStream() << header; - }; - - // There is no EOF - void writeEOF(void){}; -}; - -} -} - -#endif /* TOMAHAWKOUTPUTWRITER_H_ */ diff --git a/src/tomahawk/TomahawkReader.cpp b/src/tomahawk/TomahawkReader.cpp index 5684bc2..e17217f 100644 --- a/src/tomahawk/TomahawkReader.cpp +++ b/src/tomahawk/TomahawkReader.cpp @@ -1,15 +1,16 @@ #include "TomahawkReader.h" +#include "genotype_container.h" namespace Tomahawk { // Remember to resize buffers to header.getLargestBlockSize()+64 after header is loaded TomahawkReader::TomahawkReader() : - samples(0), - version(0), filesize_(0), + offset_end_of_data_(0), bit_width_(0), dropGenotypes(false), showHeader(true), + index_(nullptr), writer(nullptr) {} @@ -17,62 
+18,77 @@ TomahawkReader::~TomahawkReader(){ this->buffer_.deleteAll(); this->data_.deleteAll(); this->outputBuffer_.deleteAll(); - delete writer; + delete this->writer; + delete this->index_; } -bool TomahawkReader::Open(const std::string input){ - const std::string index = input + '.' + Tomahawk::Constants::OUTPUT_INDEX_SUFFIX; +bool TomahawkReader::open(const std::string input){ + if(input.size() == 0){ + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "No input filename..." << std::endl; + return false; + } + + this->stream_.open(input, std::ios::in | std::ios::binary | std::ios::ate); + if(!this->stream_.good()){ + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Failed to open file handle: " << input << std::endl; + } + this->filesize_ = this->stream_.tellg(); - // Parse Totempole - if(!this->totempole_.Open(index)){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TOTEMPOLE") << "Failed build!" << std::endl; + this->stream_.seekg(this->filesize_ - TWK_FOOTER_LENGTH); + this->stream_ >> this->footer_; + if(!this->stream_.good()){ + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Stream corrupted after loading footer..." << std::endl; return false; } + if(this->footer_.validate() == false){ + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Failed to validate footer..." 
<< std::endl; + return false; + } + + // Seek to start of index + this->stream_.seekg(this->footer_.offset_end_of_data); + const U32 l_index_data = (this->filesize_ - TWK_FOOTER_LENGTH) - this->stream_.tellg(); + buffer_type index_buffer(l_index_data + 1024); + this->stream_.read(index_buffer.data(), l_index_data); + index_buffer.n_chars = l_index_data; + this->index_ = new index_type(index_buffer.data(), index_buffer.size()); + index_buffer.deleteAll(); + // Resize buffers to accomodate the largest possible block // without ever resizing // this is for performance reasons - this->buffer_.resize(this->totempole_.getLargestBlockSize() + 64); - this->data_.resize(this->totempole_.getLargestBlockSize() + 64); - this->outputBuffer_.resize(this->totempole_.getLargestBlockSize() + 64); + this->buffer_.resize(this->getFooter().getLargestUncompressedBlock() + 64); + this->data_.resize(this->getFooter().getLargestUncompressedBlock() + 64); + this->outputBuffer_.resize(this->getFooter().getLargestUncompressedBlock() + 64); - if(input.size() == 0){ - std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "No input filename..." << std::endl; + // Seek to beginning + this->stream_.seekg(0); + if(!this->header_.open(this->stream_)){ + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Failed to load header data..." << std::endl; return false; } - this->stream_.open(input, std::ios::in | std::ios::binary | std::ios::ate); if(!this->stream_.good()){ - std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Could not open " << input << "..." << std::endl; + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Stream is bad..." << std::endl; return false; } - this->filesize_ = this->stream_.tellg(); - this->stream_.seekg(0); - // Validate MAGIC and header - if(!this->ValidateHeader(this->stream_)){ + if(this->header_.validate() == false){ std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Failed to validate header..." 
<< std::endl; return false; } - return(this->Validate()); -} - - -inline bool TomahawkReader::ValidateHeader(std::ifstream& in) const{ - char MAGIC[Constants::WRITE_HEADER_MAGIC_LENGTH]; - in.read(MAGIC, Constants::WRITE_HEADER_MAGIC_LENGTH); - - if(strncmp(&MAGIC[0], &Constants::WRITE_HEADER_MAGIC[0], Constants::WRITE_HEADER_MAGIC_LENGTH) == 0) - return true; + this->offset_end_of_data_ = this->footer_.offset_end_of_data; + this->DetermineBitWidth(); - return false; + return true; } bool TomahawkReader::getBlocks(void){ U64 buffer_size = 0; - for(U32 i = 0; i < this->totempole_.header.blocks; ++i){ - buffer_size += this->totempole_[i].uncompressed_size; + for(U32 i = 0; i < this->index_->getContainer().size(); ++i){ + buffer_size += this->index_->getContainer()[i].uncompressed_size; } if(buffer_size == 0){ @@ -80,14 +96,15 @@ bool TomahawkReader::getBlocks(void){ return false; } - if(!SILENT) - std::cerr << Helpers::timestamp("LOG","TOMAHAWK") << "Inflating " << this->totempole_.getHeader().blocks << " blocks into " << Helpers::ToPrettyString(buffer_size/1000) << " kb..." << std::endl; + //if(!SILENT) + // std::cerr << Helpers::timestamp("LOG","TOMAHAWK") << "Inflating " << this->totempole_.getHeader().getNumberBlocks() << " blocks into " << Helpers::ToPrettyString(buffer_size/1000) << " kb..." 
<< std::endl; this->data_.resize(buffer_size + 1000); - for(U32 i = 0; i < this->totempole_.getBlocks(); ++i) + for(U32 i = 0; i < this->index_->getContainer().size(); ++i){ if(!this->getBlock(i)) return false; + } return true; } @@ -95,8 +112,8 @@ bool TomahawkReader::getBlocks(void){ bool TomahawkReader::getBlocks(std::vector& blocks){ U64 buffer_size = 0; for(U32 i = 0; i < blocks.size(); ++i){ - std::cerr << i << '/' << blocks.size() << '\t' << this->totempole_[i].uncompressed_size << std::endl; - buffer_size += this->totempole_[i].uncompressed_size; + std::cerr << i << '/' << blocks.size() << '\t' << this->index_->getContainer()[i].uncompressed_size << std::endl; + buffer_size += this->index_->getContainer()[i].uncompressed_size; } if(buffer_size == 0) @@ -118,7 +135,7 @@ bool TomahawkReader::getBlocks(std::vector< std::pair >& blocks){ U64 buffer_size = 0; for(U32 i = 0; i < blocks.size(); ++i){ for(U32 j = blocks[i].first; j < blocks[i].second; ++j){ - buffer_size += this->totempole_[j].uncompressed_size; + buffer_size += this->index_->getContainer()[j].uncompressed_size; } } @@ -154,26 +171,26 @@ bool TomahawkReader::getBlock(const U32 blockID){ return false; } - this->stream_.seekg(this->totempole_[blockID].byte_offset); + this->stream_.seekg(this->index_->getContainer()[blockID].byte_offset); if(!this->stream_.good()){ std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Failed search..." 
<< std::endl; return false; } - const U32 readLength = this->totempole_[blockID].byte_offset_end - this->totempole_[blockID].byte_offset; + const U32 readLength = this->index_->getContainer()[blockID].byte_offset_end - this->index_->getContainer()[blockID].byte_offset; if(readLength > this->buffer_.capacity()){ std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Overflowing capacity: " << readLength << '/' << this->buffer_.capacity() << std::endl; exit(1); } - this->blockDataOffsets_.push_back(DataOffsetPair(&this->data_.data[this->data_.pointer], this->totempole_[blockID])); - if(!this->stream_.read(&this->buffer_.data[0], readLength)){ + this->blockDataOffsets_.push_back(DataOffsetPair(&this->data_[this->data_.size()], this->index_->getContainer()[blockID].uncompressed_size, this->index_->getContainer()[blockID])); + if(!this->stream_.read(this->buffer_.data(), readLength)){ std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Failed read: " << this->stream_.good() << '\t' << this->stream_.fail() << '/' << this->stream_.eof() << std::endl; //std::cerr << this->stream_.gcount() << '/' << readLength << std::endl; return false; } - this->buffer_.pointer = readLength; + this->buffer_.n_chars = readLength; if(!this->tgzf_controller_.Inflate(this->buffer_, this->data_)){ std::cerr << Helpers::timestamp("ERROR", "TGZF") << "Failed to inflate data..." << std::endl; @@ -183,60 +200,34 @@ bool TomahawkReader::getBlock(const U32 blockID){ return true; } -bool TomahawkReader::Validate(void){ - char temp_buffer[sizeof(float)+sizeof(U64)]; - this->stream_.read(&temp_buffer[0], sizeof(float)+sizeof(U64)); - - const float* version = reinterpret_cast(&temp_buffer[0]); - const U64* samples = reinterpret_cast(&temp_buffer[sizeof(float)]); - - this->version = *version; - this->samples = *samples; - - if(this->version != this->totempole_.getHeader().version){ - std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "File discordance: versions do not match..." 
<< std::endl; - return false; - } - - if(this->samples != this->totempole_.getHeader().samples){ - std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "File discordance:number of samples do not match" << std::endl; - return false; - } - - // Determine what bit-width functions to use - this->DetermineBitWidth(); - - return true; -} - void TomahawkReader::DetermineBitWidth(void){ - if(this->samples <= Constants::UPPER_LIMIT_SAMPLES_8B - 1){ + if(this->header_.magic_.getNumberSamples() <= Constants::UPPER_LIMIT_SAMPLES_8B - 1){ if(!SILENT){ - std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->samples << " < " << Constants::UPPER_LIMIT_SAMPLES_8B << "..." << std::endl; + std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->header_.magic_.getNumberSamples() << " < " << Constants::UPPER_LIMIT_SAMPLES_8B << "..." << std::endl; std::cerr << Helpers::timestamp("LOG", "RLE") << "Using 8-bit width..." << std::endl; } this->bit_width_ = sizeof(BYTE); - } else if(this->samples <= Constants::UPPER_LIMIT_SAMPLES_16B - 1){ + } else if(this->header_.magic_.getNumberSamples() <= Constants::UPPER_LIMIT_SAMPLES_16B - 1){ if(!SILENT){ - std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->samples << " > " << Constants::UPPER_LIMIT_SAMPLES_8B << "... Skip" << std::endl; - std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->samples << " < " << Constants::UPPER_LIMIT_SAMPLES_16B << "..." << std::endl; + std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->header_.magic_.getNumberSamples() << " > " << Constants::UPPER_LIMIT_SAMPLES_8B << "... Skip" << std::endl; + std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->header_.magic_.getNumberSamples() << " < " << Constants::UPPER_LIMIT_SAMPLES_16B << "..." << std::endl; std::cerr << Helpers::timestamp("LOG", "RLE") << "Using 16-bit width..." 
<< std::endl; } this->bit_width_ = sizeof(U16); - } else if(this->samples <= Constants::UPPER_LIMIT_SAMPLES_32B - 1){ + } else if(this->header_.magic_.getNumberSamples() <= Constants::UPPER_LIMIT_SAMPLES_32B - 1){ if(!SILENT){ - std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->samples << " > " << Constants::UPPER_LIMIT_SAMPLES_8B << "... Skip" << std::endl; - std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->samples << " > " << Constants::UPPER_LIMIT_SAMPLES_16B << "... Skip" << std::endl; - std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->samples << " < " << Constants::UPPER_LIMIT_SAMPLES_32B << "..." << std::endl; + std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->header_.magic_.getNumberSamples() << " > " << Constants::UPPER_LIMIT_SAMPLES_8B << "... Skip" << std::endl; + std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->header_.magic_.getNumberSamples() << " > " << Constants::UPPER_LIMIT_SAMPLES_16B << "... Skip" << std::endl; + std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->header_.magic_.getNumberSamples() << " < " << Constants::UPPER_LIMIT_SAMPLES_32B << "..." << std::endl; std::cerr << Helpers::timestamp("LOG", "RLE") << "Using 32-bit width..." << std::endl; } this->bit_width_ = sizeof(U32); } else { if(!SILENT){ - std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->samples << " > " << Constants::UPPER_LIMIT_SAMPLES_8B << "... Skip" << std::endl; - std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->samples << " > " << Constants::UPPER_LIMIT_SAMPLES_16B << "... Skip" << std::endl; - std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->samples << " > " << Constants::UPPER_LIMIT_SAMPLES_32B << "... Skip" << std::endl; - std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->samples << " < " << Constants::UPPER_LIMIT_SAMPLES_64B << "..." 
<< std::endl; + std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->header_.magic_.getNumberSamples() << " > " << Constants::UPPER_LIMIT_SAMPLES_8B << "... Skip" << std::endl; + std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->header_.magic_.getNumberSamples() << " > " << Constants::UPPER_LIMIT_SAMPLES_16B << "... Skip" << std::endl; + std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->header_.magic_.getNumberSamples() << " > " << Constants::UPPER_LIMIT_SAMPLES_32B << "... Skip" << std::endl; + std::cerr << Helpers::timestamp("LOG", "RLE") << "Samples: " << this->header_.magic_.getNumberSamples() << " < " << Constants::UPPER_LIMIT_SAMPLES_64B << "..." << std::endl; std::cerr << Helpers::timestamp("LOG", "RLE") << "Using 64-bit width..." << std::endl; } this->bit_width_ = sizeof(U64); @@ -260,7 +251,7 @@ bool TomahawkReader::outputBlocks(std::vector& blocks){ // Output header if(this->showHeader) - std::cout << this->totempole_.literals + "\n##tomahawk_viewCommand=" + Helpers::program_string() << std::endl; + std::cout << this->header_.getLiterals() + "\n##tomahawk_viewCommand=" + Helpers::program_string() << std::endl; for(U32 i = 0; i < blocks.size(); ++i) @@ -285,22 +276,26 @@ bool TomahawkReader::outputBlocks(){ } if(!SILENT) - std::cerr << Helpers::timestamp("LOG", "TGZF") << "Inflating " << this->totempole_.getBlocks() << " blocks..." << std::endl; + std::cerr << Helpers::timestamp("LOG", "TGZF") << "Inflating " << this->index_->getContainer().size() << " blocks..." 
<< std::endl; // Output header if(this->showHeader){ - std::cout << this->totempole_.literals << std::endl; + std::cout << this->header_.getLiterals() << std::endl; std::cout << "##INFO=" << std::endl; std::cout << "##INFO=" << std::endl; std::cout << "##tomahawk_viewCommand=" + Helpers::program_string() << std::endl; std::cout << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"; - for(U32 i = 0; i < this->totempole_.header.samples - 1; ++i) - std::cout << this->totempole_.samples[i] << '\t'; - std::cout << this->totempole_.samples[this->totempole_.header.samples - 1] << std::endl; + + std::cout << this->header_.getSample(0); + for(U32 i = 1; i < this->header_.magic_.getNumberSamples(); ++i) + std::cout << '\t' << this->header_.getSample(i); + std::cout.put('\n'); + } - for(U32 i = 0; i < this->totempole_.getBlocks(); ++i) + for(U32 i = 0; i < this->index_->getContainer().size(); ++i){ (*this.*func__)(i); + } return true; } diff --git a/src/tomahawk/TomahawkReader.h b/src/tomahawk/TomahawkReader.h index 0369982..610214f 100644 --- a/src/tomahawk/TomahawkReader.h +++ b/src/tomahawk/TomahawkReader.h @@ -6,31 +6,41 @@ #include #include +#include "../algorithm/load_balancer_ld.h" +#include "../interface/progressbar.h" #include "../support/MagicConstants.h" #include "../io/compression/TGZFController.h" #include "../io/compression/GZFConstants.h" -#include "../interface/Timer.h" -#include "../interface/ProgressBar.h" -#include "../algorithm/Balancer.h" +#include "../interface/timer.h" +#include "meta_entry.h" +#include "twk_reader_implementation.h" +#include "ld_calculation_slave.h" #include "TomahawkCalcParameters.h" -#include "base/TomahawkEntryMeta.h" -#include "TomahawkCalculateSlave.h" +#include "../index/index.h" +#include "genotype_container_reference.h" +#include "../index/tomahawk_header.h" namespace Tomahawk { // TomahawkReader class simply reads compressed data from disk class TomahawkReader { - typedef TomahawkCalcParameters parameter_type; - typedef 
Totempole::TotempoleEntry totempole_entry; + typedef TomahawkCalcParameters parameter_type; + typedef Totempole::IndexEntry totempole_entry; + typedef TomahawkHeader header_type; + typedef Index index_type; + typedef IO::BasicBuffer buffer_type; + typedef IO::TGZFController tgzf_controller_type; + typedef Totempole::Footer footer_type; public: // Used to keep track of char pointer offsets in buffer // and what Totempole entry is associated with that position struct DataOffsetPair{ - DataOffsetPair(const char* data, const totempole_entry& entry) : entry(entry), data(data){} + DataOffsetPair(const char* data, const U64 l_buffer, const totempole_entry& entry) : entry(entry), l_buffer(l_buffer), data(data){} ~DataOffsetPair(){} const totempole_entry& entry; + const U64 l_buffer; const char* data; }; @@ -38,7 +48,7 @@ class TomahawkReader { TomahawkReader(); ~TomahawkReader(); - bool Open(const std::string input); + bool open(const std::string input); // Reader functions bool getBlocks(void); @@ -46,14 +56,24 @@ class TomahawkReader { bool getBlocks(std::vector< std::pair >& blocks); bool getBlock(const U32 blockID); + // Accessors + inline footer_type& getFooter(void){ return(this->footer_); } + inline const footer_type& getFooter(void) const{ return(this->footer_); } + inline const index_type& getIndex(void) const{ return(*this->index_); } + inline index_type& getIndex(void){ return(*this->index_); } + inline const header_type& getHeader(void) const{ return(this->header_); } + inline header_type& getHeader(void){ return(this->header_); } + inline index_type* getIndexPointer(void){ return(this->index_); } + // Output functions bool outputBlocks(std::vector& blocks); bool outputBlocks(); + inline const BYTE& getBitWidth(void) const{ return(this->bit_width_); } - inline Totempole::TotempoleReader& getTotempole(void){ return(this->totempole_); } inline const DataOffsetPair& getOffsetPair(const U32 p) const{ return(this->blockDataOffsets_[p]); } inline const size_t 
DataOffsetSize(void) const{ return(this->blockDataOffsets_.size()); } + inline void setDropGenotypes(const bool yes){ this->dropGenotypes = yes; } inline void setShowHeader(const bool yes){ this->showHeader = yes; } @@ -61,24 +81,23 @@ class TomahawkReader { void DetermineBitWidth(void); template bool outputBlock(const U32 blockID); template bool WriteBlock(const char* data, const U32 blockID); - bool Validate(void); - bool ValidateHeader(std::ifstream& in) const; private: - U64 samples; // has to match header - float version; // has to match header - U64 filesize_; // filesize - BYTE bit_width_; // bit width - bool dropGenotypes; - bool showHeader; // flag to output header or not - std::ifstream stream_; // reader stream - Totempole::TotempoleReader totempole_; - - - IO::BasicBuffer buffer_; // input buffer - IO::BasicBuffer data_; // inflate buffer - IO::BasicBuffer outputBuffer_; // output buffer - IO::TGZFController tgzf_controller_; + U64 filesize_; // filesize + U64 offset_end_of_data_; + BYTE bit_width_; // bit width + bool dropGenotypes; + bool showHeader; // flag to output header or not + std::ifstream stream_; // reader stream + + header_type header_; + footer_type footer_; + index_type* index_; + + buffer_type buffer_; // input buffer + buffer_type data_; // inflate buffer + buffer_type outputBuffer_; // output buffer + tgzf_controller_type tgzf_controller_; std::vector blockDataOffsets_; @@ -92,19 +111,14 @@ bool TomahawkReader::outputBlock(const U32 blockID){ return false; } - //std::cerr << "getblock " << blockID << " seek to " << this->totempole_[blockID].byte_offset << std::endl; - this->stream_.seekg(this->totempole_[blockID].byte_offset); + this->stream_.seekg(this->index_->getContainer()[blockID].byte_offset); if(!this->stream_.good()){ std::cerr << Helpers::timestamp("ERROR", "TWK") << "Failed search..." 
<< std::endl; return false; } // Determine byte-width of data - U32 readLength = 0; - if(blockID != this->totempole_.getBlocks() - 1) - readLength = this->totempole_[blockID + 1].byte_offset - this->totempole_[blockID].byte_offset; - else - readLength = this->filesize_ - Constants::eof_length*sizeof(U64) - this->totempole_[this->totempole_.getBlocks()-1].byte_offset; + U32 readLength = this->index_->getContainer()[blockID].byte_offset_end - this->index_->getContainer()[blockID].byte_offset; if(readLength > this->buffer_.capacity()){ std::cerr << Helpers::timestamp("ERROR", "TWK") << "Impossible: " << readLength << '/' << this->buffer_.capacity() << std::endl; @@ -112,52 +126,81 @@ bool TomahawkReader::outputBlock(const U32 blockID){ } // Read from start to start + byte-width - if(!this->stream_.read(&this->buffer_.data[0], readLength)){ + if(!this->stream_.read(this->buffer_.data(), readLength)){ std::cerr << Helpers::timestamp("ERROR", "TWK") << "Failed read: " << this->stream_.good() << '\t' << this->stream_.fail() << '/' << this->stream_.eof() << std::endl; std::cerr << this->stream_.gcount() << '/' << readLength << std::endl; return false; } // Set buffer byte-width to data loaded - this->buffer_.pointer = readLength; + this->buffer_.n_chars = readLength; // Keep track of position because inflate function moves pointer - char* data_position = &this->data_.data[this->data_.pointer]; + char* data_position = &this->data_[this->data_.size()]; // Inflate TGZF block if(!this->tgzf_controller_.Inflate(this->buffer_, this->data_)){ std::cerr << Helpers::timestamp("ERROR", "TGZF") << "Failed to inflate DATA..." 
<< std::endl; return false; } - - // Todo: move to function this->WriteBlock(data_position, blockID); return true; } template -bool TomahawkReader::WriteBlock(const char* data, const U32 blockID){ - TomahawkBlock tomahawk_controller(data, this->totempole_[blockID]); +bool TomahawkReader::WriteBlock(const char* const data, const U32 blockID){ + Base::GenotypeContainerReference o(data, + this->index_->getContainer()[blockID].uncompressed_size, + this->index_->getContainer()[blockID], + this->header_.magic_.getNumberSamples(), + false); // For each variant in Tomahawk block - for(U32 j = 0; j < tomahawk_controller.support->variants; ++j){ - tomahawk_controller.WriteVariant(this->totempole_, this->outputBuffer_, this->dropGenotypes); - - // Next variant - ++tomahawk_controller; - - // Keep flushing regularly + for(U32 j = 0; j < o.size(); ++j){ + const char separator = o.currentMeta().getPhaseVCFCharacter(); + + this->outputBuffer_ += this->header_.contigs_[this->index_->getContainer()[blockID].contigID].name; + this->outputBuffer_ += '\t'; + this->outputBuffer_ += std::to_string(o.currentMeta().position); + this->outputBuffer_ += "\t.\t"; + this->outputBuffer_ += o.currentMeta().getRefAllele(); + this->outputBuffer_ += '\t'; + this->outputBuffer_ += o.currentMeta().getAltAllele(); + this->outputBuffer_ += "\t.\t.\t"; + this->outputBuffer_ += "HWE_P="; + this->outputBuffer_ += std::to_string(o.currentMeta().HWE_P); + this->outputBuffer_ += ";MAF="; + this->outputBuffer_ += std::to_string(o.currentMeta().MAF); + if(this->dropGenotypes == false){ + this->outputBuffer_ += "\tGT\t"; + for(U32 i = 0; i < o.currentMeta().runs; ++i){ + if(i != 0) this->outputBuffer_ += '\t'; + const char& left = Constants::TOMAHAWK_ALLELE_LOOKUP_REVERSE[o[i].alleleA]; + const char& right = Constants::TOMAHAWK_ALLELE_LOOKUP_REVERSE[o[i].alleleB]; + this->outputBuffer_ += left; + this->outputBuffer_ += separator; + this->outputBuffer_ += right; + for(U32 r = 1; r < o[i].runs; ++r){ + 
this->outputBuffer_ += '\t'; + this->outputBuffer_ += left; + this->outputBuffer_ += separator; + this->outputBuffer_ += right; + } + } + } + this->outputBuffer_ += '\n'; + ++o; if(this->outputBuffer_.size() > 65536){ - //this->writer->write(&this->outputBuffer_.data[0], this->outputBuffer_.pointer); - std::cout.write(&this->outputBuffer_.data[0], this->outputBuffer_.pointer); + //this->writer->write(this->outputBuffer_.data(), this->outputBuffer_.n_chars); + std::cout.write(this->outputBuffer_.data(), this->outputBuffer_.n_chars); this->outputBuffer_.reset(); } } // Flush last - //this->writer->write(&this->outputBuffer_.data[0], this->outputBuffer_.pointer); - std::cout.write(&this->outputBuffer_.data[0], this->outputBuffer_.pointer); + //this->writer->write(this->outputBuffer_.data(), this->outputBuffer_.n_chars); + std::cout.write(this->outputBuffer_.data(), this->outputBuffer_.n_chars); // Reset buffers this->outputBuffer_.reset(); // reset diff --git a/src/tomahawk/genotype_bitvector.h b/src/tomahawk/genotype_bitvector.h new file mode 100644 index 0000000..80e0052 --- /dev/null +++ b/src/tomahawk/genotype_bitvector.h @@ -0,0 +1,66 @@ +#ifndef TOMAHAWK_BASE_GENOTYPE_BITVECTOR_H_ +#define TOMAHAWK_BASE_GENOTYPE_BITVECTOR_H_ + +#include "../support/simd_definitions.h" + +namespace Tomahawk{ +namespace Base{ + +/**< + * Data structure to representing a 1-bit allele + * representation of genotypes. This data structure + * has to be aligned to `SIMD_ALIGNMENT` as specified + * by the user CPU architecture. If no SIMD is available + * on the device then use regular memory alignment. + * + * Special techniques to accelerate pairwise comparisons: + * 1) Front and tail number of SIMD _elements_ (e.g. 128 bits / 16 bytes) + * that are either all 0 or 1. This allows the algorithm + * to either completely skip these stretches or + * resort to cheaper comparison functors. 
+ * 2) Counts of missingness needs to be maintained for these + * tail and head elements to function correctly. + */ +template +struct GenotypeBitvector{ +public: + GenotypeBitvector(const U32 size): + frontZero(0), + tailZero(0), + frontZeroMissing(0), + tailZeroMissing(0), + #if SIMD_AVAILABLE == 1 + data((BYTE*)_mm_malloc(size, T)), + mask((BYTE*)_mm_malloc(size, T)) + #else + data(new BYTE[size]), + mask(new BYTE[size]) + #endif + { + memset(this->data, 0, size); + memset(this->mask, 0, size); + } + + ~GenotypeBitvector(){ + #if SIMD_AVAILABLE == 1 + _mm_free(this->data); + _mm_free(this->mask); + #else + delete [] this->data; + delete [] this->mask; + #endif + } + +public: + U16 frontZero; // leading zeros in aligned vector width + U16 tailZero; // trailing zeros in aligned vector width + U16 frontZeroMissing; // number of missing values in leading zeros + U16 tailZeroMissing; // number of missing values in trailing zeros + BYTE* data; + BYTE* mask; +} __attribute__((aligned(16))); + +} +} + +#endif /* TOMAHAWK_BASE_GENOTYPE_BITVECTOR_H_ */ diff --git a/src/tomahawk/genotype_container.h b/src/tomahawk/genotype_container.h new file mode 100644 index 0000000..302023a --- /dev/null +++ b/src/tomahawk/genotype_container.h @@ -0,0 +1,146 @@ +#ifndef TOMAHAWK_BASE_GENOTYPE_CONTAINER_H_ +#define TOMAHAWK_BASE_GENOTYPE_CONTAINER_H_ + +#include // size_t, ptrdiff_t +#include // forward_iterator_tag + +#include "../support/type_definitions.h" +#include "../index/index_entry.h" +#include "genotype_container_bitvector.h" +#include "genotype_container_runlength.h" + +namespace Tomahawk{ +namespace Base{ + +template +class GenotypeContainer{ +private: + typedef GenotypeContainer self_type; + typedef GenotypeContainerBitvector container_bitvector_type; + typedef GenotypeContainerRunlength container_runlength_type; + typedef GenotypeContainerRunlengthObjects genotype_runlength_type; + typedef Base::GenotypeBitvector<> genotype_bitvector_type; + typedef MetaEntry meta_type; + 
typedef Totempole::IndexEntry header_entry; + typedef genotype_runlength_type value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + +public: + GenotypeContainer(const char* const data_buffer, const size_t l_buffer_length, const header_entry& support, const U64& n_samples) : + n_entries(support.n_variants), + index_entry(support), // invoke copy ctor + meta_entries(static_cast(::operator new[](this->n_entries*sizeof(meta_type)))), + container_runlength(nullptr), + container_bitvector(nullptr) + { + if(l_buffer_length == 0) + return; + + // Interpret meta entries + size_t cumulative_position = 0; + size_t genotype_cost = 0; + + for(size_t i = 0; i < this->size(); ++i){ + new( &this->meta_entries[i] ) meta_type( &data_buffer[cumulative_position] ); + cumulative_position += TOMAHAWK_ENTRY_META_SIZE + sizeof(T); + genotype_cost += meta_entries[i].runs*sizeof(T); + } + assert(cumulative_position + genotype_cost == l_buffer_length); + + // Interpret run lengths + this->container_runlength = new container_runlength_type(&data_buffer[cumulative_position], l_buffer_length - cumulative_position, this->size(), this->meta_entries); + + // Interpret bit vectors + this->container_bitvector = new container_bitvector_type(*this->container_runlength, n_samples); + } + + ~GenotypeContainer(){ + for(size_type i = 0; i < this->size(); ++i) + ((this->meta_entries + i)->~MetaEntry)(); + + ::operator delete[](static_cast(this->meta_entries)); + delete this->container_runlength; + delete this->container_bitvector; + } + + class iterator{ + private: + typedef iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + reference operator*() const{ return *ptr_; } + 
pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + class const_iterator{ + private: + typedef const_iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + const_iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + const_reference operator*() const{ return *ptr_; } + const_pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + // Element access + inline reference at(const size_type& position){ return(this->container_runlength->at(position)); } + inline const_reference at(const size_type& position) const{ return(this->container_runlength->at(position)); } + inline reference operator[](const size_type& position){ return(this->container_runlength->at(position)); } + inline const_reference operator[](const size_type& position) const{ return(this->container_runlength->at(position)); } + inline pointer data(void){ return(this->container_runlength); } + inline const_pointer data(void) const{ return(this->container_runlength); } + inline reference front(void){ return(this->container_runlength->at(0)); } + inline const_reference front(void) const{ return(this->container_runlength->at(0)); } + inline reference back(void){ return(this->container_runlength->at(this->n_entries - 1)); } + inline const_reference back(void) const{ return(this->container_runlength->at(this->n_entries - 1)); } + + // Accessor + inline const meta_type& getMeta(const U32& position) const{ return(this->meta_entries[position]); } + inline const header_entry& getTotempole(void) const{ return(this->index_entry); } + inline const genotype_bitvector_type& getBitvector(const 
U32& position) const{ return(this->container_bitvector->at(position)); } + + // Capacity + inline const bool empty(void) const{ return(this->n_entries == 0); } + inline const size_type& size(void) const{ return(this->n_entries); } + + // Iterator + inline iterator begin(){ return iterator(&this->container_runlength[0]); } + inline iterator end() { return iterator(&this->container_runlength[this->n_entries - 1]); } + inline const_iterator begin() const{ return const_iterator(&this->container_runlength[0]); } + inline const_iterator end() const{ return const_iterator(&this->container_runlength[this->n_entries - 1]); } + inline const_iterator cbegin() const{ return const_iterator(&this->container_runlength[0]); } + inline const_iterator cend() const{ return const_iterator(&this->container_runlength[this->n_entries - 1]); } + +private: + size_type n_entries; + header_entry index_entry; + meta_type* meta_entries; + container_runlength_type* container_runlength; + container_bitvector_type* container_bitvector; +}; + +} +} + + +#endif /* TOMAHAWK_BASE_GENOTYPE_CONTAINER_H_ */ diff --git a/src/tomahawk/genotype_container_bitvector.h b/src/tomahawk/genotype_container_bitvector.h new file mode 100644 index 0000000..4a295fb --- /dev/null +++ b/src/tomahawk/genotype_container_bitvector.h @@ -0,0 +1,313 @@ +#ifndef TOMAHAWK_BASE_GENOTYPE_CONTAINER_BITVECTOR_H_ +#define TOMAHAWK_BASE_GENOTYPE_CONTAINER_BITVECTOR_H_ + +#include "../algorithm/genotype_bitpacker.h" +#include "genotype_bitvector.h" +#include "genotype_container_runlength.h" +#include "genotype_objects.h" + +namespace Tomahawk{ +namespace Base{ + +/**< + * Meta container for bit-packed blocks of genotypes + */ +class GenotypeContainerBitvector{ +private: + typedef GenotypeContainerBitvector self_type; + typedef Base::GenotypeBitvector<> value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef 
std::ptrdiff_t difference_type; + typedef std::size_t size_type; + typedef Totempole::IndexEntry support_type; + +public: + GenotypeContainerBitvector() : n_entries(0), n_capacity(0), __entries(nullptr){} + ~GenotypeContainerBitvector(){ + // Cleanup + for(std::size_t i = 0; i < this->n_entries; ++i) + ((this->__entries + i)->~GenotypeBitvector)(); + + ::operator delete[](static_cast(this->__entries)); + } + + GenotypeContainerBitvector(const GenotypeContainerRunlength& genotype_container, const U64& n_samples) : + n_entries(0), + n_capacity(this->n_entries), + __entries(nullptr) + { + this->Build(genotype_container, n_samples); + } + + GenotypeContainerBitvector(const GenotypeContainerRunlength& genotype_container, const U64& n_samples) : + n_entries(0), + n_capacity(this->n_entries), + __entries(nullptr) + { + this->Build(genotype_container, n_samples); + } + + GenotypeContainerBitvector(const GenotypeContainerRunlength& genotype_container, const U64& n_samples) : + n_entries(0), + n_capacity(this->n_entries), + __entries(nullptr) + { + this->Build(genotype_container, n_samples); + } + + GenotypeContainerBitvector(const GenotypeContainerRunlength& genotype_container, const U64& n_samples) : + n_entries(0), + n_capacity(this->n_entries), + __entries(nullptr) + { + this->Build(genotype_container, n_samples); + } + + // copy constructor + GenotypeContainerBitvector(const self_type& other) : + n_entries(other.n_entries), + n_capacity(other.n_capacity), + __entries(other.__entries) + { + + } + + // move constructor + GenotypeContainerBitvector(self_type&& other) noexcept : + n_entries(other.n_entries), + n_capacity(other.n_capacity), + __entries(other.__entries) + { + other.__entries = nullptr; + } + + GenotypeContainerBitvector& operator=(self_type&& other) noexcept{ + // prevent self-move + if(this != &other){ + this->n_entries = other.n_entries; + this->n_capacity = other.n_capacity; + // swap + } + return *this; + } + + class iterator{ + private: + typedef 
iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + reference operator*() const{ return *ptr_; } + pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + class const_iterator{ + private: + typedef const_iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + const_iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + const_reference operator*() const{ return *ptr_; } + const_pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + // Element access + inline reference at(const size_type& position){ return(this->__entries[position]); } + inline const_reference at(const size_type& position) const{ return(this->__entries[position]); } + inline reference operator[](const size_type& position){ return(this->__entries[position]); } + inline const_reference operator[](const size_type& position) const{ return(this->__entries[position]); } + inline pointer data(void){ return(this->__entries); } + inline const_pointer data(void) const{ return(this->__entries); } + inline reference front(void){ return(this->__entries[0]); } + inline const_reference front(void) const{ return(this->__entries[0]); } + inline reference back(void){ return(this->__entries[this->n_entries - 1]); } + inline const_reference back(void) const{ return(this->__entries[this->n_entries - 1]); } + + // Capacity + inline const bool empty(void) const{ return(this->n_entries == 0); } + inline const size_type& size(void) const{ 
return(this->n_entries); } + inline const size_type& capacity(void) const{ return(this->n_capacity); } + + // Iterator + inline iterator begin(){ return iterator(&this->__entries[0]); } + inline iterator end() { return iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator begin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator end() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator cbegin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator cend() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + + // Internal construction function + template + bool Build(const GenotypeContainerRunlength& genotype_container, const U64& n_samples); + + template + bool Build(const Support::GenotypeDiploidRun* const genotype_runs, const MetaEntry* const meta_entries, const size_t& n_entries, const U64& n_samples); + +public: + size_type n_entries; + size_type n_capacity; + pointer __entries; +}; + +template +bool GenotypeContainerBitvector::Build(const GenotypeContainerRunlength& genotype_container, + const U64& n_samples) +{ + if(genotype_container.size() == 0) + return false; + + // Cleanup + for(std::size_t i = 0; i < this->n_entries; ++i) + ((this->__entries + i)->~GenotypeBitvector)(); + + ::operator delete[](static_cast(this->__entries)); + + // Allocate new + this->n_entries = genotype_container.size(); + this->n_capacity = this->n_entries; + this->__entries = static_cast(::operator new[](this->n_entries*sizeof(value_type))); + + const U32 byte_width = ceil((double)n_samples/4); + + // INVERSE mask is cheaper in terms of instructions used + // exploited in calculations: TomahawkCalculationSlave + const BYTE lookup_mask[16] = {0, 0, 3, 3, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; + const BYTE lookup_data[16] = {0, 1, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + // Cycle over variants in container + for(U32 i = 0; i < 
genotype_container.size(); ++i){ + new( &this->__entries[i] ) value_type( byte_width ); + Algorithm::GenotypeBitPacker packerA(this->__entries[i].data, 2); + Algorithm::GenotypeBitPacker packerB(this->__entries[i].mask, 2); + + // Cycle over runs in container + for(U32 j = 0; j < genotype_container[i].size(); ++j){ + const Support::GenotypeDiploidRunPacked* const packed = reinterpret_cast* const>(&genotype_container[i][j]); + packerA.add(lookup_data[packed->alleles], packed->runs); + packerB.add(lookup_mask[packed->alleles], packed->runs); + } + } + + const U32 byteAlignedEnd = byte_width / (GENOTYPE_TRIP_COUNT/4) * (GENOTYPE_TRIP_COUNT/4); + + // Search for zero runs in either end + for(U32 i = 0; i < genotype_container.size(); ++i){ + S32 j = 0; + + // Search from left->right + for(; j < byteAlignedEnd; ++j){ + if(this->__entries[i].data[j] != 0 || this->__entries[i].mask[j] != 0) + break; + } + + // Front of zeroes + this->__entries[i].frontZero = ((j - 1 < 0 ? 0 : j - 1)*4)/GENOTYPE_TRIP_COUNT; + if(j == byteAlignedEnd) + continue; + + j = byteAlignedEnd - 1; + for(; j > 0; --j){ + if(this->__entries[i].data[j] != 0 || this->__entries[i].mask[j] != 0) + break; + } + + // Tail of zeroes + this->__entries[i].tailZero = ((byteAlignedEnd - (j+1))*4)/GENOTYPE_TRIP_COUNT; + } + return true; +} + +template +bool GenotypeContainerBitvector::Build(const Support::GenotypeDiploidRun* const genotype_runs, + const MetaEntry* const meta_entries, + const size_t& n_entries, + const U64& n_samples) +{ + if(n_entries == 0) + return false; + + // Cleanup + for(std::size_t i = 0; i < this->n_entries; ++i) + ((this->__entries + i)->~GenotypeBitvector)(); + + ::operator delete[](static_cast(this->__entries)); + + // Allocate new + this->n_entries = n_entries; + this->n_capacity = this->n_entries; + this->__entries = static_cast(::operator new[](this->n_entries*sizeof(value_type))); + + const U32 byte_width = ceil((double)n_samples/4); + + // INVERSE mask is cheaper in terms of 
instructions used + // exploited in calculations: TomahawkCalculationSlave + const BYTE lookup_mask[16] = {0, 0, 3, 3, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; + const BYTE lookup_data[16] = {0, 1, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + // Cycle over variants in container + U32 cumulative_position = 0; + for(U32 i = 0; i < n_entries; ++i){ + new( &this->__entries[i] ) value_type( byte_width ); + Algorithm::GenotypeBitPacker packerA(this->__entries[i].data, 2); + Algorithm::GenotypeBitPacker packerB(this->__entries[i].mask, 2); + + // Cycle over runs in container + for(U32 j = 0; j < meta_entries[i].runs; ++j, cumulative_position++){ + const Support::GenotypeDiploidRunPacked* const packed = reinterpret_cast* const>(&genotype_runs[cumulative_position]); + packerA.add(lookup_data[packed->alleles], packed->runs); + packerB.add(lookup_mask[packed->alleles], packed->runs); + } + } + + const U32 byteAlignedEnd = byte_width / (GENOTYPE_TRIP_COUNT/4) * (GENOTYPE_TRIP_COUNT/4); + + // Search for zero runs in either end + for(U32 i = 0; i < n_entries; ++i){ + S32 j = 0; + + // Search from left->right + for(; j < byteAlignedEnd; ++j){ + if(this->__entries[i].data[j] != 0 || this->__entries[i].mask[j] != 0) + break; + } + + // Front of zeroes + this->__entries[i].frontZero = ((j - 1 < 0 ? 
0 : j - 1)*4)/GENOTYPE_TRIP_COUNT; + if(j == byteAlignedEnd) + continue; + + j = byteAlignedEnd - 1; + for(; j > 0; --j){ + if(this->__entries[i].data[j] != 0 || this->__entries[i].mask[j] != 0) + break; + } + + // Tail of zeroes + this->__entries[i].tailZero = ((byteAlignedEnd - (j+1))*4)/GENOTYPE_TRIP_COUNT; + } + return true; +} + +} +} + + + +#endif /* TOMAHAWK_BASE_GENOTYPE_CONTAINER_BITVECTOR_H_ */ diff --git a/src/tomahawk/genotype_container_reference.h b/src/tomahawk/genotype_container_reference.h new file mode 100644 index 0000000..fbb0646 --- /dev/null +++ b/src/tomahawk/genotype_container_reference.h @@ -0,0 +1,139 @@ +#ifndef TOMAHAWK_BASE_GENOTYPE_CONTAINER_REFERENCE_H_ +#define TOMAHAWK_BASE_GENOTYPE_CONTAINER_REFERENCE_H_ + +#include // size_t, ptrdiff_t + +#include "../support/type_definitions.h" +#include "../index/index_entry.h" +#include "genotype_container_bitvector.h" + +namespace Tomahawk{ +namespace Base{ + +/**< + * Special genotype container for both run-length encoded + * and bit-encoded genotypes using unaligned memory directly + * interpreted from type-casts at compile time and has some + * psuedo-iterator capabilities. + * This works because there is no random access in the pairwise + * comparator functions. Upper triangular comparisons can be done + * by strictly iterating forward. + * + * Data can only be interpreted by invoking the standard constructor. + * All other constructors have non-standard meaning. + * 1) Copy constructor: copies the pointer addresses and iterator + * positions only. 
+ * 2) Assignment operator: copies the iterator position only + */ +template +class GenotypeContainerReference{ +private: + typedef GenotypeContainerReference self_type; + +protected: + typedef Totempole::IndexEntry header_entry_type; + typedef GenotypeContainerBitvector container_bitvector_type; + typedef Base::GenotypeBitvector<> genotype_bitvector_type; + typedef Support::GenotypeDiploidRun value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + typedef MetaEntry meta_type; + +public: + GenotypeContainerReference() : + n_entries(0), + iterator_position_meta(0), + iterator_position_runs(0), + owns_bitvectors(true), + meta_entries(nullptr), + genotype_entries(nullptr), + index_entry(nullptr), + bit_vectors(nullptr) + { + } + + GenotypeContainerReference(const char* const data, const size_t l_data, const header_entry_type& index_entry, const size_t n_samples, const bool build_bitvectors = true) : + n_entries(index_entry.n_variants), + iterator_position_meta(0), + iterator_position_runs(0), + owns_bitvectors(true), + meta_entries(reinterpret_cast(data)), + genotype_entries(reinterpret_cast(&data[this->n_entries * (TOMAHAWK_ENTRY_META_SIZE + sizeof(T))])), + index_entry(&index_entry), + bit_vectors(nullptr) + { + if(build_bitvectors){ + this->bit_vectors = new container_bitvector_type(); + this->bit_vectors->Build(this->genotype_entries, this->meta_entries, this->size(), n_samples); + } + } + + // Copy ctor: copies iterator positions and pointers + GenotypeContainerReference(const self_type& other) : + n_entries(other.n_entries), + iterator_position_meta(other.iterator_position_meta), + iterator_position_runs(other.iterator_position_runs), + owns_bitvectors(false), + meta_entries(other.meta_entries), + genotype_entries(other.genotype_entries), + index_entry(other.index_entry), + 
bit_vectors(other.bit_vectors) + { + + } + + ~GenotypeContainerReference(){ + if(this->owns_bitvectors) + delete this->bit_vectors; + } + + // // copy pointers only! + void operator=(const self_type& other){ + this->iterator_position_runs = other.iterator_position_runs; + this->iterator_position_meta = other.iterator_position_meta; + } + + // Accessor + inline const header_entry_type& getTotempole(void) const{ return(*this->index_entry); } + inline const genotype_bitvector_type& getBitvector(const U32& position) const{ return(this->bit_vectors->at(position)); } + + // Capacity + inline const bool empty(void) const{ return(this->n_entries == 0); } + inline const size_type& size(void) const{ return(this->n_entries); } + inline void resetIterator(void){ this->iterator_position_runs = 0; this->iterator_position_meta = 0; } + + // Accessor + inline const char* const data(void) const{ return(this->genotype_entries); } + inline const char* const meta_data(void) const{ return(this->meta_entries); } + + inline const meta_type& getMeta(const U32& position) const{ return(this->meta_entries[position]); } + inline const meta_type& currentMeta(void) const{ return(this->meta_entries[this->iterator_position_meta]); } + inline const_pointer current(void) const{ return(&this->genotype_entries[this->iterator_position_runs]); } + inline const genotype_bitvector_type& currentBitvector(void) const{ return(this->bit_vectors->at(this->iterator_position_meta)); } + + + // Psuedo-iterator functionality + inline void operator++(void){ this->iterator_position_runs += this->currentMeta().runs; ++this->iterator_position_meta; } + inline void operator--(void){ this->iterator_position_runs -= this->currentMeta().runs; ++this->iterator_position_meta; } + inline const_reference operator[](const U32& position) const{ return(this->genotype_entries[this->iterator_position_runs + position]); } + inline const_reference at(const U32& position) const{ 
return(this->genotype_entries[this->iterator_position_runs + position]); } + +protected: + size_type n_entries; + size_type iterator_position_meta; + size_type iterator_position_runs; + bool owns_bitvectors; + const meta_type* meta_entries; + const_pointer genotype_entries; + const header_entry_type* index_entry; + container_bitvector_type* bit_vectors; +}; + +} +} + +#endif /* TOMAHAWK_BASE_GENOTYPE_CONTAINER_REFERENCE_H_ */ diff --git a/src/tomahawk/genotype_container_runlength.h b/src/tomahawk/genotype_container_runlength.h new file mode 100644 index 0000000..e592a0e --- /dev/null +++ b/src/tomahawk/genotype_container_runlength.h @@ -0,0 +1,123 @@ +#ifndef TOMAHAWK_BASE_GENOTYPE_CONTAINER_RUNLENGTH_H_ +#define TOMAHAWK_BASE_GENOTYPE_CONTAINER_RUNLENGTH_H_ + +#include + +#include "genotype_container_runlength_objects.h" +#include "genotype_objects.h" +#include "meta_entry.h" + +namespace Tomahawk{ +namespace Base{ + +template +class GenotypeContainerRunlength{ +private: + typedef GenotypeContainerRunlength self_type; + typedef GenotypeContainerRunlengthObjects value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + typedef MetaEntry meta_type; + +public: + GenotypeContainerRunlength() : + n_entries(0), + __entries(nullptr) + {} + + GenotypeContainerRunlength(const char* const genotype_buffer, const size_t l_buffer_length, const size_t n_entries, const meta_type* const meta_entries) : + n_entries(n_entries), + __entries(static_cast(::operator new[](n_entries*sizeof(value_type)))) + { + assert(n_entries > 0); + assert(l_buffer_length % sizeof(T) == 0); + + size_t cumulative_position = 0; + for(size_t i = 0; i < this->size(); ++i){ + new( &this->__entries[i] ) value_type( &genotype_buffer[cumulative_position], meta_entries[i].runs * sizeof(T) ); + cumulative_position += 
meta_entries[i].runs * sizeof(T); + assert(this->__entries[i].size() > 0); + } + assert(cumulative_position == l_buffer_length); + } + + ~GenotypeContainerRunlength(){ + for(std::size_t i = 0; i < this->n_entries; ++i) + ((this->__entries + i)->~GenotypeContainerRunlengthObjects)(); + + ::operator delete[](static_cast(this->__entries)); + } + + class iterator{ + private: + typedef iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + reference operator*() const{ return *ptr_; } + pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + class const_iterator{ + private: + typedef const_iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + const_iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + const_reference operator*() const{ return *ptr_; } + const_pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + // Element access + inline reference at(const size_type& position){ return(this->__entries[position]); } + inline const_reference at(const size_type& position) const{ return(this->__entries[position]); } + inline reference operator[](const size_type& position){ return(this->__entries[position]); } + inline const_reference operator[](const size_type& position) const{ return(this->__entries[position]); } + inline pointer data(void){ return(this->__entries); } + inline const_pointer data(void) const{ return(this->__entries); } + inline reference front(void){ return(this->__entries[0]); } + 
inline const_reference front(void) const{ return(this->__entries[0]); } + inline reference back(void){ return(this->__entries[this->n_entries - 1]); } + inline const_reference back(void) const{ return(this->__entries[this->n_entries - 1]); } + + // Capacity + inline const bool empty(void) const{ return(this->n_entries == 0); } + inline const size_type& size(void) const{ return(this->n_entries); } + + // Iterator + inline iterator begin(){ return iterator(&this->__entries[0]); } + inline iterator end() { return iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator begin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator end() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator cbegin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator cend() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + +private: + size_type n_entries; + pointer __entries; +}; + +} +} + + + +#endif /* TOMAHAWK_BASE_GENOTYPE_CONTAINER_RUNLENGTH_H_ */ diff --git a/src/tomahawk/genotype_container_runlength_objects.h b/src/tomahawk/genotype_container_runlength_objects.h new file mode 100644 index 0000000..9e59cac --- /dev/null +++ b/src/tomahawk/genotype_container_runlength_objects.h @@ -0,0 +1,114 @@ +#ifndef TOMAHAWK_BASE_GENOTYPE_CONTAINER_RUNLENGTH_OBJECTS_H_ +#define TOMAHAWK_BASE_GENOTYPE_CONTAINER_RUNLENGTH_OBJECTS_H_ + +#include "genotype_objects.h" + +namespace Tomahawk{ +namespace Base{ + +/**< + * Primary run-length encoded genotype objects used in Tomahawk. + * These higher-order objects are encoded in fixed-width basic + * unsigned primitives. + * The template parameter represents the encoded primitive type + * for a given Tomahawk file. 
+ */ +template +class GenotypeContainerRunlengthObjects{ +private: + typedef GenotypeContainerRunlengthObjects self_type; + typedef Support::GenotypeDiploidRun value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + +public: + GenotypeContainerRunlengthObjects() : + n_entries(0), + __entries(nullptr) + {} + + GenotypeContainerRunlengthObjects(const char* const genotype_buffer, const size_t l_buffer_length) : + n_entries(l_buffer_length / sizeof(T)), + __entries(new value_type[this->n_entries]) + { + assert(l_buffer_length % sizeof(T) == 0); + memcpy(this->__entries, genotype_buffer, l_buffer_length); + } + + ~GenotypeContainerRunlengthObjects(){ + delete [] this->__entries; + } + + class iterator{ + private: + typedef iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + reference operator*() const{ return *ptr_; } + pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + class const_iterator{ + private: + typedef const_iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + const_iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + const_reference operator*() const{ return *ptr_; } + const_pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + // Element access + inline reference at(const size_type& position){ 
return(this->__entries[position]); } + inline const_reference at(const size_type& position) const{ return(this->__entries[position]); } + inline reference operator[](const size_type& position){ return(this->__entries[position]); } + inline const_reference operator[](const size_type& position) const{ return(this->__entries[position]); } + inline pointer data(void){ return(this->__entries); } + inline const_pointer data(void) const{ return(this->__entries); } + inline reference front(void){ return(this->__entries[0]); } + inline const_reference front(void) const{ return(this->__entries[0]); } + inline reference back(void){ return(this->__entries[this->n_entries - 1]); } + inline const_reference back(void) const{ return(this->__entries[this->n_entries - 1]); } + + // Capacity + inline const bool empty(void) const{ return(this->n_entries == 0); } + inline const size_type& size(void) const{ return(this->n_entries); } + + // Iterator + inline iterator begin(){ return iterator(&this->__entries[0]); } + inline iterator end() { return iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator begin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator end() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator cbegin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator cend() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + +private: + size_type n_entries; + pointer __entries; +}; + +} +} + + + +#endif /* TOMAHAWK_BASE_GENOTYPE_CONTAINER_RUNLENGTH_OBJECTS_H_ */ diff --git a/src/tomahawk/genotype_meta_container_reference.h b/src/tomahawk/genotype_meta_container_reference.h new file mode 100644 index 0000000..1c337eb --- /dev/null +++ b/src/tomahawk/genotype_meta_container_reference.h @@ -0,0 +1,141 @@ +#ifndef TOMAHAWK_BASE_GENOTYPE_META_CONTAINER_REFERENCE_H_ +#define TOMAHAWK_BASE_GENOTYPE_META_CONTAINER_REFERENCE_H_ + +#include // size_t, 
ptrdiff_t + +#include "../support/type_definitions.h" +#include "../index/index_entry.h" +#include "genotype_container_reference.h" + +namespace Tomahawk{ + +template +class GenotypeMetaContainerReference{ +private: + typedef GenotypeMetaContainerReference self_type; + typedef Base::GenotypeContainerReference value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + typedef Totempole::IndexEntry header_entry;; + +public: + GenotypeMetaContainerReference(const U64 n_samples) : + n_entries(0), + n_capacity(0), + n_samples(n_samples), + __entries(nullptr) + { + + } + + GenotypeMetaContainerReference(const U64 n_samples, const size_t n_capacity) : + n_entries(0), + n_capacity(n_capacity), + n_samples(n_samples), + __entries(static_cast(::operator new[](this->n_capacity*sizeof(value_type)))) + { + + } + + ~GenotypeMetaContainerReference(){ + for(size_type i = 0; i < this->size(); ++i) + ((this->__entries + i)->~value_type)(); + + ::operator delete[](static_cast(this->__entries)); + } + + class iterator{ + private: + typedef iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + reference operator*() const{ return *ptr_; } + pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + class const_iterator{ + private: + typedef const_iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + const_iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + const_reference operator*() const{ return *ptr_; } 
+ const_pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + // Element access + inline reference at(const size_type& position){ return(this->__entries[position]); } + inline const_reference at(const size_type& position) const{ return(this->__entries[position]); } + inline reference operator[](const size_type& position){ return(this->__entries[position]); } + inline const_reference operator[](const size_type& position) const{ return(this->__entries[position]); } + inline pointer data(void){ return(this->__entries); } + inline const_pointer data(void) const{ return(this->__entries); } + inline reference front(void){ return(this->__entries[0]); } + inline const_reference front(void) const{ return(this->__entries[0]); } + inline reference back(void){ return(this->__entries[this->n_entries - 1]); } + inline const_reference back(void) const{ return(this->__entries[this->n_entries - 1]); } + + // Capacity + inline const bool empty(void) const{ return(this->n_entries == 0); } + inline const size_type& size(void) const{ return(this->n_entries); } + inline const size_type& capacity(void) const{ return(this->n_capacity); } + + // Iterator + inline iterator begin(){ return iterator(&this->__entries[0]); } + inline iterator end() { return iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator begin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator end() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator cbegin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator cend() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + + // Update + bool addDataBlock(const char* const data, const size_t l_data, const header_entry& header){ + // Container is full + // Resize is 
required + if(this->n_entries + 1 == this->n_capacity || this->capacity() == 0) + return false; + + new( &this->__entries[this->n_entries] ) value_type(data, l_data, header, this->n_samples); + ++this->n_entries; + return true; + } + + const U64 countVariants(void) const{ + U64 n_total = 0; + for(U32 i = 0; i < this->size(); ++i) + n_total += this->at(i).getTotempole().size(); + + return(n_total); + } + + const U64& numberSamples(void) const{ return(this->n_samples); } + +private: + size_type n_entries; + size_type n_capacity; + U64 n_samples; + pointer __entries; +}; + +} + +#endif /* TOMAHAWK_BASE_GENOTYPE_META_CONTAINER_REFERENCE_H_ */ diff --git a/src/tomahawk/base/TomahawkSupport.h b/src/tomahawk/genotype_objects.h similarity index 54% rename from src/tomahawk/base/TomahawkSupport.h rename to src/tomahawk/genotype_objects.h index b353bb7..d70f351 100644 --- a/src/tomahawk/base/TomahawkSupport.h +++ b/src/tomahawk/genotype_objects.h @@ -4,29 +4,39 @@ namespace Tomahawk{ namespace Support{ -#pragma pack(1) +#pragma pack(push, 1) template -struct TomahawkRun{ +struct __attribute__((packed, aligned(1))) GenotypeDiploidRun{ public: - TomahawkRun(); // Disallowed ctor - ~TomahawkRun(); // Disallowed dtor + GenotypeDiploidRun(){} + GenotypeDiploidRun(const char* const buffer){ + T* t = reinterpret_cast(this->alleleA); + *t = *reinterpret_cast(buffer); + } + ~GenotypeDiploidRun(){} T alleleA: Constants::TOMAHAWK_ALLELE_PACK_WIDTH, alleleB: Constants::TOMAHAWK_ALLELE_PACK_WIDTH, runs: sizeof(T)*8 - Constants::TOMAHAWK_SNP_PACK_WIDTH; }; -#pragma pack(1) + template -struct TomahawkRunPacked{ +struct __attribute__((packed, aligned(1))) GenotypeDiploidRunPacked{ public: - TomahawkRunPacked(); // Disallowed ctor - ~TomahawkRunPacked(); // Disallowed dtor + GenotypeDiploidRunPacked(){} + GenotypeDiploidRunPacked(const char* const buffer){ + T* t = reinterpret_cast(this->alleles); + *t = *reinterpret_cast(buffer); + } + ~GenotypeDiploidRunPacked(){} T alleles: 
Constants::TOMAHAWK_SNP_PACK_WIDTH, runs: sizeof(T)*8 - Constants::TOMAHAWK_SNP_PACK_WIDTH; }; +#pragma pack(pop) + } // end support namespace Constants{ diff --git a/src/tomahawk/TomahawkImporterFilters.h b/src/tomahawk/import_filters.h similarity index 59% rename from src/tomahawk/TomahawkImporterFilters.h rename to src/tomahawk/import_filters.h index 8a5f3c2..ecf63e6 100644 --- a/src/tomahawk/TomahawkImporterFilters.h +++ b/src/tomahawk/import_filters.h @@ -3,9 +3,9 @@ namespace Tomahawk{ -struct TomahawkImporterFilters{ - TomahawkImporterFilters() : MAF(0), HWE_P(0), missingness(0.2){} - ~TomahawkImporterFilters(){} +struct ImporterFilters{ + ImporterFilters() : MAF(0), HWE_P(0), missingness(0.2){} + ~ImporterFilters(){} double MAF; double HWE_P; diff --git a/src/tomahawk/import_writer.cpp b/src/tomahawk/import_writer.cpp new file mode 100644 index 0000000..ed08341 --- /dev/null +++ b/src/tomahawk/import_writer.cpp @@ -0,0 +1,239 @@ +#include "../index/tomahawk_header.h" +#include "import_writer.h" + +namespace Tomahawk { + +ImportWriter::ImportWriter(const filter_type& filter) : + flush_limit(1000000), + n_variants_limit(1024), + blocksWritten_(0), + variants_written_(0), + largest_uncompressed_block_(0), + filter(filter), + rleController_(nullptr), + buffer_rle_(this->flush_limit*2), + buffer_meta_(this->flush_limit*2), + vcf_header_(nullptr) +{} + +ImportWriter::~ImportWriter(){ + delete this->rleController_; + this->buffer_rle_.deleteAll(); + this->buffer_meta_.deleteAll(); +} + +bool ImportWriter::Open(const std::string output){ + this->filename = output; + this->CheckOutputNames(output); + this->stream.open(this->basePath + this->baseName + '.' + Constants::OUTPUT_SUFFIX, std::ios::out | std::ios::binary); + + // Check streams + if(!this->stream.good()){ + std::cerr << Helpers::timestamp("ERROR", "WRITER") << "Could not open: " << this->basePath + this->baseName + '.' + Constants::OUTPUT_SUFFIX << "!" 
<< std::endl; + return false; + } + + if(!SILENT){ + std::cerr << Helpers::timestamp("LOG", "WRITER") << "Opening: " << this->basePath + this->baseName + '.' + Constants::OUTPUT_SUFFIX << "..." << std::endl; + } + + // Write Tomahawk and Totempole headers + this->WriteHeaders(); + + // Determine flush limit + this->DetermineFlushLimit(); + + return true; +} + +void ImportWriter::DetermineFlushLimit(void){ + this->flush_limit = this->vcf_header_->samples * this->n_variants_limit / 10; // Worst case + if(this->vcf_header_->samples <= Constants::UPPER_LIMIT_SAMPLES_8B - 1) + this->flush_limit *= sizeof(BYTE); + else if(this->vcf_header_->samples <= Constants::UPPER_LIMIT_SAMPLES_16B - 1) + this->flush_limit *= sizeof(U16); + else if(this->vcf_header_->samples <= Constants::UPPER_LIMIT_SAMPLES_32B - 1) + this->flush_limit *= sizeof(U32); + else this->flush_limit *= sizeof(U64); +} + +bool ImportWriter::OpenExtend(const std::string output){ + + return true; +} + +int ImportWriter::WriteHeaders(void){ + if(this->vcf_header_ == nullptr){ + std::cerr << Helpers::timestamp("ERROR", "INTERNAL") << "Header not set!" 
<< std::endl; + exit(1); + } + + // Move data from VCF header to TomahawkHeader + TomahawkHeader header; + header.magic_.n_contigs = this->vcf_header_->contigs.size(); + header.magic_.n_samples = this->vcf_header_->samples; + header.sample_names_ = new std::string[this->vcf_header_->samples]; + header.contigs_ = new Totempole::HeaderContig[this->vcf_header_->contigs.size()]; + + for(U32 i = 0; i < this->vcf_header_->contigs.size(); ++i){ + + header.contigs_[i].interpret(this->vcf_header_->contigs[i].length, + this->vcf_header_->contigs[i].name.size(), + this->vcf_header_->contigs[i].name); + } + + for(U32 i = 0; i < this->vcf_header_->samples; ++i) + header.sample_names_[i] = this->vcf_header_->sampleNames[i]; + + for(U32 i = 0; i < this->vcf_header_->literal_lines.size(); ++i) + header.literals_ += this->vcf_header_->literal_lines[i] + '\n'; + + const std::string command = "##tomahawk_importCommand=" + std::string(Constants::LITERAL_COMMAND_LINE) + + "; VERSION=" + std::string(VERSION) + + "; Date=" + Tomahawk::Helpers::datetime() + "; SIMD=" + SIMD_MAPPING[SIMD_VERSION]; + + header.literals_ += command; + + return(header.write(this->stream)); +} + +void ImportWriter::WriteFinal(index_type& index, footer_type& footer){ + footer.l_largest_uncompressed = this->largest_uncompressed_block_; + footer.offset_end_of_data = this->stream.tellp(); + index.setSorted(true); + + this->stream << index; + this->stream << footer; +} + +void ImportWriter::setHeader(VCF::VCFHeader& header){ + this->vcf_header_ = &header; + this->rleController_ = new Algorithm::GenotypeEncoder(header.samples); + this->rleController_->DetermineBitWidth(); +} + +bool ImportWriter::add(const VCF::VCFLine& line){ + const U32 meta_start_pos = this->buffer_meta_.size(); + const U32 rle_start_pos = this->buffer_rle_.size(); + if(!this->rleController_->RunLengthEncode(line, this->buffer_meta_, this->buffer_rle_)){ + this->buffer_meta_.n_chars = meta_start_pos; // reroll back + this->buffer_rle_.n_chars = 
rle_start_pos; // reroll back + return false; + } + + const U64 n_runs = (this->buffer_rle_.n_chars - rle_start_pos)/this->rleController_->getBitWidth(); + const MetaEntryBase& base_meta = *reinterpret_cast(&this->buffer_meta_[meta_start_pos]); + + if(n_runs == 1){ + this->buffer_meta_.n_chars = meta_start_pos; // reroll back + this->buffer_rle_.n_chars = rle_start_pos; // reroll back + //std::cerr << "singleton" << std::endl; + return false; + } + + if(base_meta.HWE_P < this->filter.HWE_P){ + this->buffer_meta_.n_chars = meta_start_pos; // reroll back + this->buffer_rle_.n_chars = rle_start_pos; // reroll back + //std::cerr << "HWE_P < " << this->filter.HWE_P << ": " << base_meta.HWE_P << '\t' << base_meta << std::endl; + return false; + } + + if(base_meta.MAF < this->filter.MAF){ + this->buffer_meta_.n_chars = meta_start_pos; // reroll back + this->buffer_rle_.n_chars = rle_start_pos; // reroll back + //std::cerr << "MAF < " << this->filter.MAF << ": " << base_meta.MAF << '\t' << base_meta << std::endl; + return false; + } + + if(this->totempole_entry.min_position == 0) + this->totempole_entry.min_position = line.position; + + this->totempole_entry.max_position = line.position; + ++this->totempole_entry; + + return true; +} + +bool ImportWriter::add(const BCF::BCFEntry& line){ + const U32 meta_start_pos = this->buffer_meta_.size(); + const U32 rle_start_pos = this->buffer_rle_.size(); + + if(!this->rleController_->RunLengthEncode(line, this->buffer_meta_, this->buffer_rle_)){ + this->buffer_meta_.n_chars = meta_start_pos; // reroll back + this->buffer_rle_.n_chars = rle_start_pos; // reroll back + return false; + } + + const U64 n_runs = (this->buffer_rle_.n_chars - rle_start_pos)/this->rleController_->getBitWidth(); + const MetaEntryBase& base_meta = *reinterpret_cast(&this->buffer_meta_[meta_start_pos]); + + if(n_runs == 1){ + this->buffer_meta_.n_chars = meta_start_pos; // reroll back + this->buffer_rle_.n_chars = rle_start_pos; // reroll back + return false; 
+ } + + if(base_meta.HWE_P < this->filter.HWE_P){ + this->buffer_meta_.n_chars = meta_start_pos; // reroll back + this->buffer_rle_.n_chars = rle_start_pos; // reroll back + //std::cerr << "HWE_P < " << this->filter.HWE_P << ": " << base_meta.HWE_P << std::endl; + return false; + } + + + + if(base_meta.MAF < this->filter.MAF){ + this->buffer_meta_.n_chars = meta_start_pos; // reroll back + this->buffer_rle_.n_chars = rle_start_pos; // reroll back + //std::cerr << "MAF < " << this->filter.MAF << ": " << base_meta.MAF << std::endl; + return false; + } + + if(this->totempole_entry.min_position == 0) + this->totempole_entry.min_position = line.body->POS + 1; + + this->totempole_entry.max_position = line.body->POS + 1; + ++this->totempole_entry; + + return true; +} + +// flush and write +bool ImportWriter::flush(void){ + if(this->buffer_meta_.size() == 0){ + //std::cerr << Helpers::timestamp("ERROR", "WRITER") << "Cannot flush writer with 0 entries..." << std::endl; + return false; + } + + this->totempole_entry.byte_offset = this->stream.tellp(); // IO offset in Tomahawk output + this->gzip_controller_.Deflate(this->buffer_meta_, this->buffer_rle_); // Deflate block + this->stream << this->gzip_controller_; // Write tomahawk output + this->gzip_controller_.Clear(); // Clean up gzip controller + + // Keep track of largest block observed + if(this->buffer_meta_.size() > this->largest_uncompressed_block_) + this->largest_uncompressed_block_ = this->buffer_meta_.size(); + + this->totempole_entry.uncompressed_size = this->buffer_meta_.size(); // Store uncompressed size + this->totempole_entry.byte_offset_end = this->stream.tellp(); // IO offset in Tomahawk output + + ++this->blocksWritten_; // update number of blocks written + this->variants_written_ += this->totempole_entry.size(); // update number of variants written + + this->reset(); // reset buffers + return true; +} + +void ImportWriter::CheckOutputNames(const std::string& input){ + std::vector paths = 
Helpers::filePathBaseExtension(input); + this->basePath = paths[0]; + if(this->basePath.size() > 0) + this->basePath += '/'; + + if(paths[3].size() == Constants::OUTPUT_SUFFIX.size() && strncasecmp(&paths[3][0], &Constants::OUTPUT_SUFFIX[0], Constants::OUTPUT_SUFFIX.size()) == 0) + this->baseName = paths[2]; + else this->baseName = paths[1]; +} + + +} /* namespace Tomahawk */ diff --git a/src/tomahawk/TomahawkImportWriter.h b/src/tomahawk/import_writer.h similarity index 63% rename from src/tomahawk/TomahawkImportWriter.h rename to src/tomahawk/import_writer.h index ceb55f8..e1b4bf9 100644 --- a/src/tomahawk/TomahawkImportWriter.h +++ b/src/tomahawk/import_writer.h @@ -3,35 +3,41 @@ #include -#include "../support/TypeDefinitions.h" +#include "../algorithm/compression/genotype_encoder.h" +#include "../support/type_definitions.h" #include "../io/BasicBuffer.h" #include "../io/BasicWriters.h" #include "../io/compression/TGZFController.h" #include "../io/vcf/VCFHeaderConstants.h" #include "../io/vcf/VCFLines.h" #include "../io/vcf/VCFHeader.h" -#include "../algorithm/compression/TomahawkImportRLE.h" -#include "base/TomahawkEntryMeta.h" -#include "../totempole/TotempoleEntry.h" -#include "../totempole/TotempoleReader.h" +#include "../index/index.h" #include "../support/simd_definitions.h" -#include "TomahawkImporterFilters.h" +#include "import_filters.h" +#include "meta_entry.h" +#include "../index/footer.h" namespace Tomahawk { -class TomahawkImportWriter { - typedef IO::BasicBuffer buffer_type; - typedef TomahawkImporterFilters filter_type; +class ImportWriter { +private: + typedef ImportWriter self_type; + typedef IO::BasicBuffer buffer_type; + typedef ImporterFilters filter_type; + typedef Totempole::IndexEntry index_entry_type; + typedef Index index_type; + typedef Totempole::Footer footer_type; public: - TomahawkImportWriter(const filter_type& filter); - ~TomahawkImportWriter(); + ImportWriter(const filter_type& filter); + ~ImportWriter(); bool Open(const 
std::string output); void DetermineFlushLimit(void); bool OpenExtend(const std::string output); - void WriteHeaders(void); - void WriteFinal(void); + int WriteHeaders(void); + void WriteFinal(index_type& container, footer_type& footer); + void setHeader(VCF::VCFHeader& header); bool add(const VCF::VCFLine& line); bool add(const BCF::BCFEntry& line); @@ -44,7 +50,7 @@ class TomahawkImportWriter { inline void TotempoleSwitch(const U32 contig, const U32 minPos){ this->totempole_entry.reset(); this->totempole_entry.contigID = contig; - this->totempole_entry.minPosition = minPos; + this->totempole_entry.min_position = minPos; } // flush and write @@ -53,7 +59,7 @@ class TomahawkImportWriter { inline bool checkSize() const{ // if the current size is larger than our desired output block size, return TRUE to trigger a flush // or if the number of entries written to buffer exceeds our set limit - if(this->totempole_entry.variants >= this->n_variants_limit || this->buffer_rle_.size() >= this->flush_limit){ + if(this->totempole_entry.n_variants >= this->n_variants_limit || this->buffer_rle_.size() >= this->flush_limit){ //std::cerr << "flushing: " << this->totempole_entry_.variants << '/' << this->n_variants_limit << '\t' << this->buffer_rle_.size() << '/' << this->flush_limit << std::endl; return true; } @@ -68,11 +74,11 @@ class TomahawkImportWriter { inline U32 GetVariantsWritten(void) const{ return this->variants_written_; } - inline Totempole::TotempoleEntry& getTotempoleEntry(void){ return(this->totempole_entry); } + inline index_entry_type& getTotempoleEntry(void){ return(this->totempole_entry); } public: - std::ofstream streamTomahawk; // stream - std::ofstream streamTotempole; // stream + std::ofstream stream; // stream + U32 flush_limit; U32 n_variants_limit; U32 blocksWritten_; // number of blocks written @@ -80,11 +86,11 @@ class TomahawkImportWriter { U32 largest_uncompressed_block_;// size of largest block in b const filter_type& filter; // filters - 
Totempole::TotempoleEntry totempole_entry; + index_entry_type totempole_entry; IO::TGZFController gzip_controller_; - Algorithm::TomahawkImportRLE* rleController_; - IO::BasicBuffer buffer_rle_; // run lengths - IO::BasicBuffer buffer_meta_; // meta data for run lengths (chromosome, position, ref/alt) + Algorithm::GenotypeEncoder* rleController_; + buffer_type buffer_rle_; // run lengths + buffer_type buffer_meta_; // meta data for run lengths (chromosome, position, ref/alt) VCF::VCFHeader* vcf_header_; diff --git a/src/tomahawk/TomahawkSlaveSIMDHelper.h b/src/tomahawk/ld_calculation_simd_helper.h similarity index 91% rename from src/tomahawk/TomahawkSlaveSIMDHelper.h rename to src/tomahawk/ld_calculation_simd_helper.h index 291ac3e..bec88ff 100644 --- a/src/tomahawk/TomahawkSlaveSIMDHelper.h +++ b/src/tomahawk/ld_calculation_simd_helper.h @@ -7,9 +7,9 @@ namespace Tomahawk{ namespace Support{ template -struct TomahawkSlaveSIMDHelper{ +struct LDCalculationSIMDHelper{ public: - TomahawkSlaveSIMDHelper(void) : + LDCalculationSIMDHelper(void) : #if SIMD_AVAILABLE == 1 counters((U64*)_mm_malloc(sizeof(U64)*16, Y)), scalarA((BYTE*)_mm_malloc(sizeof(BYTE)*8, Y)), @@ -27,7 +27,7 @@ struct TomahawkSlaveSIMDHelper{ memset(this->counters, 0, sizeof(U64)*16); } - ~TomahawkSlaveSIMDHelper(){ + ~LDCalculationSIMDHelper(){ #if SIMD_AVAILABLE == 1 _mm_free(this->counters); _mm_free(this->scalarA); @@ -44,7 +44,7 @@ struct TomahawkSlaveSIMDHelper{ } public: - U64* counters; + U64* counters; BYTE* scalarA; BYTE* scalarB; BYTE* scalarC; diff --git a/src/tomahawk/TomahawkCalculateSlave.h b/src/tomahawk/ld_calculation_slave.h similarity index 77% rename from src/tomahawk/TomahawkCalculateSlave.h rename to src/tomahawk/ld_calculation_slave.h index 9c146c8..184ebda 100644 --- a/src/tomahawk/TomahawkCalculateSlave.h +++ b/src/tomahawk/ld_calculation_slave.h @@ -1,5 +1,5 @@ -#ifndef TOMAHAWK_TOMAHAWKCALCULATESLAVE_H_ -#define TOMAHAWK_TOMAHAWKCALCULATESLAVE_H_ +#ifndef 
TOMAHAWK_LD_CALCULATION_SLAVE_H_ +#define TOMAHAWK_LD_CALCULATION_SLAVE_H_ #include #include @@ -8,16 +8,16 @@ #include "../support/simd_definitions.h" #include "../algorithm/spinlock.h" -#include "TomahawkOutput/TomahawkOutputLD.h" -#include "../interface/ProgressBar.h" +#include "../interface/progressbar.h" #include "TomahawkCalcParameters.h" -#include "../math/FisherMath.h" -#include "../algorithm/LoadBalancerBlock.h" -#include "../algorithm/GenotypeBitPacker.h" -#include "TomahawkSlaveSIMDHelper.h" +#include "../algorithm/load_balancer_block.h" +#include "../algorithm/genotype_bitpacker.h" #include "../io/BasicWriters.h" -#include "TomahawkBlockManager.h" -#include "TomahawkOutput/TomahawkOutputManager.h" +#include "../math/fisher_math.h" +#include "genotype_meta_container_reference.h" +#include "ld_calculation_simd_helper.h" +#include "two/output_entry_support.h" +#include "../io/output_writer.h" // Method 1: None: Input-specified (default) // Method 2: Phased Vectorized No-Missing @@ -26,13 +26,13 @@ // Method 5: Phased A1 and Phased A2 // Method 6: All algorithms comparison (debug) // Method 7: All algorithms run-time output (debug) -#define SLAVE_DEBUG_MODE 1 +#define SLAVE_DEBUG_MODE 1 namespace Tomahawk{ #if SLAVE_DEBUG_MODE == 7 -#pragma pack(1) -struct __costHelper{ +#pragma pack(push, 1) +struct __attribute__((packed, aligned(1))) __costHelper{ // RLE_A and RLE_B for A1 float RLE_A, RLE_B; // cycles A1_P and A1_U @@ -49,6 +49,7 @@ struct __costHelper{ buffer.Add(reinterpret_cast(&entry), sizeof(__costHelper)); } }; +#pragma pack(pop) #elif SLAVE_DEBUG_MODE == 6 /* This supportive structure is only used internally for @@ -57,7 +58,7 @@ struct __costHelper{ */ struct __methodCompare{ typedef __methodCompare self_type; - typedef Tomahawk::Support::TomahawkOutputLD helper_type; + typedef Tomahawk::Support::OutputEntrySupport helper_type; __methodCompare(){} ~__methodCompare(){} @@ -67,8 +68,8 @@ struct __methodCompare{ friend std::ostream& 
operator<<(std::ostream& os, const self_type& m){ // P, PV, PVM, U, UV, UVM - os << "P\t" << m.phased[0][0] << '\t' << m.phased[0][1] << '\t' << m.phased[0][2] << '\t' << m.phased[0][3] << std::endl; - os << "PV\t" << m.phased[1][0] << '\t' << m.phased[1][1] << '\t' << m.phased[1][2] << '\t' << m.phased[1][3] << std::endl; + os << "P\t" << m.phased[0][0] << '\t' << m.phased[0][1] << '\t' << m.phased[0][2] << '\t' << m.phased[0][3] << std::endl; + os << "PV\t" << m.phased[1][0] << '\t' << m.phased[1][1] << '\t' << m.phased[1][2] << '\t' << m.phased[1][3] << std::endl; os << "PVM\t" << m.phased[2][0] << '\t' << m.phased[2][1] << '\t' << m.phased[2][2] << '\t' << m.phased[2][3] << std::endl; os << "U\t"; for(U32 i = 0; i < 8; ++i) os << m.unphased[0][i] << '\t'; os << m.unphased[0][8] << std::endl;; @@ -148,24 +149,25 @@ struct __methodCompare{ #define UNPHASED_UPPER_MASK 170 // 10101010b #define UNPHASED_LOWER_MASK 85 // 01010101b -#define FILTER_UNPHASED_BYTE(A, B) (((((A & UNPHASED_UPPER_MASK) | (B & UNPHASED_LOWER_MASK)) & UNPHASED_LOWER_MASK) << 1) & A) +#define FILTER_UNPHASED_BYTE(A, B) (((((A & UNPHASED_UPPER_MASK) | (B & UNPHASED_LOWER_MASK)) & UNPHASED_LOWER_MASK) << 1) & A) #define FILTER_UNPHASED_BYTE_PAIR(A, B, C, D) ((FILTER_UNPHASED_BYTE(A, B) >> 1) | FILTER_UNPHASED_BYTE(C, D)) -#define FILTER_UNPHASED_BYTE_SPECIAL(A) (((A >> 1) & A) & UNPHASED_LOWER_MASK) +#define FILTER_UNPHASED_BYTE_SPECIAL(A) (((A >> 1) & A) & UNPHASED_LOWER_MASK) #if SIMD_VERSION == 6 // AVX-512: UNTESTED -const VECTOR_TYPE ONE_MASK = _mm512_set1_epi8(255); // 11111111b +#define VECTOR_TYPE __m512i +const VECTOR_TYPE ONE_MASK = _mm512_set1_epi8(255); // 11111111b const VECTOR_TYPE maskUnphasedHigh = _mm512_set1_epi8(UNPHASED_UPPER_MASK); // 10101010b const VECTOR_TYPE maskUnphasedLow = _mm512_set1_epi8(UNPHASED_LOWER_MASK); // 01010101b -#define PHASED_ALTALT(A,B) _mm512_and_si512(A, B) -#define PHASED_REFREF(A,B) _mm512_and_si512(_mm512_xor_si512(A, ONE_MASK), _mm512_xor_si512(B, 
ONE_MASK)) -#define PHASED_ALTREF(A,B) _mm512_and_si512(_mm512_xor_si512(A, B), B) -#define PHASED_REFALT(A,B) _mm512_and_si512(_mm512_xor_si512(A, B), A) +#define PHASED_ALTALT(A,B) _mm512_and_si512(A, B) +#define PHASED_REFREF(A,B) _mm512_and_si512(_mm512_xor_si512(A, ONE_MASK), _mm512_xor_si512(B, ONE_MASK)) +#define PHASED_ALTREF(A,B) _mm512_and_si512(_mm512_xor_si512(A, B), B) +#define PHASED_REFALT(A,B) _mm512_and_si512(_mm512_xor_si512(A, B), A) #define PHASED_ALTALT_MASK(A,B,M) _mm512_and_si512(PHASED_ALTALT(A, B), M) #define PHASED_REFREF_MASK(A,B,M) _mm512_and_si512(PHASED_REFREF(A, B), M) #define PHASED_ALTREF_MASK(A,B,M) _mm512_and_si512(PHASED_ALTREF(A, B), M) #define PHASED_REFALT_MASK(A,B,M) _mm512_and_si512(PHASED_REFALT(A, B), M) -#define MASK_MERGE(A,B) _mm512_xor_si512(_mm512_or_si512(A, B), ONE_MASK) +#define MASK_MERGE(A,B) _mm512_xor_si512(_mm512_or_si512(A, B), ONE_MASK) #define POPCOUNT(A, B) { \ __m256i tempA = _mm512_extracti64x4_epi64(B, 0); \ @@ -187,19 +189,19 @@ const VECTOR_TYPE maskUnphasedLow = _mm512_set1_epi8(UNPHASED_LOWER_MASK); // 0 #elif SIMD_VERSION == 5 // AVX2 #define VECTOR_TYPE __m256i -const VECTOR_TYPE ONE_MASK = _mm256_set1_epi8(255); // 11111111b +const VECTOR_TYPE ONE_MASK = _mm256_set1_epi8(255); // 11111111b const VECTOR_TYPE maskUnphasedHigh = _mm256_set1_epi8(UNPHASED_UPPER_MASK); // 10101010b const VECTOR_TYPE maskUnphasedLow = _mm256_set1_epi8(UNPHASED_LOWER_MASK); // 01010101b -#define PHASED_ALTALT(A,B) _mm256_and_si256(A, B) -#define PHASED_REFREF(A,B) _mm256_and_si256(_mm256_xor_si256(A, ONE_MASK), _mm256_xor_si256(B, ONE_MASK)) -#define PHASED_ALTREF(A,B) _mm256_and_si256(_mm256_xor_si256(A, B), B) -#define PHASED_REFALT(A,B) _mm256_and_si256(_mm256_xor_si256(A, B), A) +#define PHASED_ALTALT(A,B) _mm256_and_si256(A, B) +#define PHASED_REFREF(A,B) _mm256_and_si256(_mm256_xor_si256(A, ONE_MASK), _mm256_xor_si256(B, ONE_MASK)) +#define PHASED_ALTREF(A,B) _mm256_and_si256(_mm256_xor_si256(A, B), B) +#define 
PHASED_REFALT(A,B) _mm256_and_si256(_mm256_xor_si256(A, B), A) #define PHASED_ALTALT_MASK(A,B,M) _mm256_and_si256(PHASED_ALTALT(A, B), M) #define PHASED_REFREF_MASK(A,B,M) _mm256_and_si256(PHASED_REFREF(A, B), M) #define PHASED_ALTREF_MASK(A,B,M) _mm256_and_si256(PHASED_ALTREF(A, B), M) #define PHASED_REFALT_MASK(A,B,M) _mm256_and_si256(PHASED_REFALT(A, B), M) -#define MASK_MERGE(A,B) _mm256_xor_si256(_mm256_or_si256(A, B), ONE_MASK) +#define MASK_MERGE(A,B) _mm256_xor_si256(_mm256_or_si256(A, B), ONE_MASK) // Software intrinsic popcount #define POPCOUNT(A, B) { \ @@ -215,19 +217,19 @@ const VECTOR_TYPE maskUnphasedLow = _mm256_set1_epi8(UNPHASED_LOWER_MASK); // 0 #elif SIMD_VERSION >= 2 // SSE2+ #define VECTOR_TYPE __m128i -const VECTOR_TYPE ONE_MASK = _mm_set1_epi8(255); // 11111111b +const VECTOR_TYPE ONE_MASK = _mm_set1_epi8(255); // 11111111b const VECTOR_TYPE maskUnphasedHigh = _mm_set1_epi8(UNPHASED_UPPER_MASK); // 10101010b const VECTOR_TYPE maskUnphasedLow = _mm_set1_epi8(UNPHASED_LOWER_MASK); // 01010101b -#define PHASED_ALTALT(A,B) _mm_and_si128(A, B) -#define PHASED_REFREF(A,B) _mm_and_si128(_mm_xor_si128(A, ONE_MASK), _mm_xor_si128(B, ONE_MASK)) -#define PHASED_ALTREF(A,B) _mm_and_si128(_mm_xor_si128(A, B), B) -#define PHASED_REFALT(A,B) _mm_and_si128(_mm_xor_si128(A, B), A) +#define PHASED_ALTALT(A,B) _mm_and_si128(A, B) +#define PHASED_REFREF(A,B) _mm_and_si128(_mm_xor_si128(A, ONE_MASK), _mm_xor_si128(B, ONE_MASK)) +#define PHASED_ALTREF(A,B) _mm_and_si128(_mm_xor_si128(A, B), B) +#define PHASED_REFALT(A,B) _mm_and_si128(_mm_xor_si128(A, B), A) #define PHASED_ALTALT_MASK(A,B,M) _mm_and_si128(PHASED_ALTALT(A, B), M) #define PHASED_REFREF_MASK(A,B,M) _mm_and_si128(PHASED_REFREF(A, B), M) #define PHASED_ALTREF_MASK(A,B,M) _mm_and_si128(PHASED_ALTREF(A, B), M) #define PHASED_REFALT_MASK(A,B,M) _mm_and_si128(PHASED_REFALT(A, B), M) -#define MASK_MERGE(A,B) _mm_xor_si128(_mm_or_si128(A, B), ONE_MASK) +#define MASK_MERGE(A,B) _mm_xor_si128(_mm_or_si128(A, 
B), ONE_MASK) #if SIMD_VERSION >= 3 #define POPCOUNT(A, B) { \ @@ -250,37 +252,40 @@ const VECTOR_TYPE maskUnphasedLow = _mm_set1_epi8(UNPHASED_LOWER_MASK); // 0101 #endif // ENDIF SIMD_AVAILABLE == 1 template -class TomahawkCalculateSlave{ - //Basic typedefs - typedef TomahawkCalculateSlave self_type; - typedef TomahawkBlockManager manager_type; - typedef TomahawkBlock controller_type; - typedef const TomahawkEntryMeta meta_type; - typedef const Support::TomahawkRun run_type; - typedef Totempole::TotempoleEntry totempole_entry_type; - typedef IO::TomahawkOutputManager output_manager_type; - typedef Support::TomahawkOutputLD helper_type; - typedef TomahawkBlockPackedPair<> simd_pair; +class LDSlave{ + typedef LDSlave self_type; + typedef GenotypeMetaContainerReference manager_type; + typedef Base::GenotypeContainerReference block_type; + typedef const MetaEntry meta_type; + typedef const Support::GenotypeDiploidRun run_type; + typedef Totempole::IndexEntry totempole_entry_type; + typedef IO::OutputWriter output_writer_type; + typedef Support::OutputEntrySupport helper_type; + typedef Base::GenotypeBitvector<> simd_pair; + typedef Base::GenotypeContainerRunlengthObjects rle_type; + typedef Interface::ProgressBar progress_bar_type; + typedef Support::LDCalculationSIMDHelper<> simd_helper_type; + typedef TomahawkCalcParameters parameter_type; // Work orders - typedef Tomahawk::LoadBalancerBlock order_type; - typedef std::vector work_order; + typedef Tomahawk::LoadBalancerBlock order_type; + typedef std::vector work_order; // Function pointers - typedef void (self_type::*phaseFunction)(const controller_type& block1, const controller_type block2); + typedef void (self_type::*phaseFunction)(const block_type& block1, const block_type& block2); public: - TomahawkCalculateSlave(const manager_type& manager, - output_manager_type& writer, - Interface::ProgressBar& progress, - const TomahawkCalcParameters& parameters, - const work_order& orders); + LDSlave(const manager_type& 
manager, + output_writer_type& writer, + Interface::ProgressBar& progress, + const TomahawkCalcParameters& parameters, + const work_order& orders); - ~TomahawkCalculateSlave(); + ~LDSlave(); - TomahawkCalculateSlave& operator=(const TomahawkCalculateSlave& other); - TomahawkCalculateSlave& operator=(TomahawkCalculateSlave&& other) noexcept; - TomahawkCalculateSlave& operator+=(const TomahawkCalculateSlave& other); + LDSlave& operator=(const LDSlave& other); + LDSlave& operator=(LDSlave&& other) noexcept; + LDSlave& operator+=(const LDSlave& other); std::thread* Start(void){ this->thread = std::thread(&self_type::Calculate, this); @@ -292,31 +297,39 @@ class TomahawkCalculateSlave{ inline const U64& getNoHets(void) const{ return this->no_uncertainty; } inline const U64& getInsufficientData(void) const{ return this->insufficent_alleles; } inline U64 getComparisons(void) const{ return(this->impossible + this->possible + this->insufficent_alleles); } - output_manager_type& getOutputManager(void){ return(this->output_manager); } + inline output_writer_type& getWriter(void){ return(this->output_writer); } private: bool Calculate(); bool DiagonalWorkOrder(const order_type& order); bool SquareWorkOrder(const order_type& order); - bool CompareBlocks(controller_type& block1); - bool CompareBlocks(controller_type& block1, controller_type block2); - inline void CompareBlocksFunction(const controller_type& block1, const controller_type block2); - inline void CompareBlocksFunctionForcedPhased(const controller_type& block1, const controller_type block2); - inline void CompareBlocksFunctionForcedUnphased(const controller_type& block1, const controller_type block2); - bool CalculateLDPhased(const controller_type& a, const controller_type& b); + + // Comparator functions + bool CompareBlocks(block_type& block1); + bool CompareBlocks(block_type& block1, block_type& block2); + inline void CompareBlocksFunction(const block_type& block1, const block_type& block2); + inline void 
CompareBlocksFunctionForcedPhased(const block_type& block1, const block_type& block2); + inline void CompareBlocksFunctionForcedUnphased(const block_type& block1, const block_type& block2); + + // Phased functions + bool CalculateLDPhased(const block_type& block1, const block_type& block2); bool CalculateLDPhasedMath(void); - bool CalculateLDPhasedVectorized(const controller_type& a, const controller_type& b); - bool CalculateLDPhasedVectorizedNoMissing(const controller_type& a, const controller_type& b); - bool CalculateLDUnphased(const controller_type& a, const controller_type& b); - bool CalculateLDUnphasedVectorized(const controller_type& a, const controller_type& b); - bool CalculateLDUnphasedVectorizedNoMissing(const controller_type& a, const controller_type& b); + bool CalculateLDPhasedVectorized(const block_type& block1, const block_type& block2); + bool CalculateLDPhasedVectorizedNoMissing(const block_type& block1, const block_type& block2); + + // Unphased functions + bool CalculateLDUnphased(const block_type& block1, const block_type& block2); + bool CalculateLDUnphasedVectorized(const block_type& block1, const block_type& block2); + bool CalculateLDUnphasedVectorizedNoMissing(const block_type& block1, const block_type& block2); bool CalculateLDUnphasedMath(void); + + // General functions double EstimateChiSq(const double& target, const double& p, const double& q) const; bool ChooseF11Calculate(const double& target, const double& p, const double& q); - void setFLAGs(const controller_type& a, const controller_type& b); + void setFLAGs(const block_type& block1, const block_type& block2); private: - const TomahawkCalcParameters& parameters; + const parameter_type& parameters; // Counters U32 block_comparisons; @@ -332,7 +345,7 @@ class TomahawkCalculateSlave{ //U64 false_negative; helper_type helper; - Support::TomahawkSlaveSIMDHelper<> helper_simd; + simd_helper_type helper_simd; Algorithm::FisherMath fisherController; const manager_type& manager; @@ 
-340,10 +353,10 @@ class TomahawkCalculateSlave{ std::thread thread; // writer manager - output_manager_type output_manager; // each thread has their own output manager + output_writer_type output_writer; // each thread has their own output manager with its own buffer // progress - Interface::ProgressBar& progress; + progress_bar_type& progress; // function pointers phaseFunction phase_function_across; @@ -360,15 +373,15 @@ class TomahawkCalculateSlave{ }; template -TomahawkCalculateSlave::TomahawkCalculateSlave(const manager_type& manager, - output_manager_type& writer, - Interface::ProgressBar& progress, - const TomahawkCalcParameters& parameters, +LDSlave::LDSlave(const manager_type& manager, + output_writer_type& writer, + progress_bar_type& progress, + const parameter_type& parameters, const work_order& orders) : parameters(parameters), block_comparisons(0), variant_comparisons(0), - samples(manager.header.getSamples()), + samples(manager.numberSamples()), impossible(0), possible(0), no_uncertainty(0), @@ -377,7 +390,7 @@ TomahawkCalculateSlave::TomahawkCalculateSlave(const manager_type& manager, //false_negative(0), fisherController(1024), manager(manager), - output_manager(writer), + output_writer(writer), progress(progress), phase_function_across(nullptr), orders(orders), @@ -387,39 +400,40 @@ TomahawkCalculateSlave::TomahawkCalculateSlave(const manager_type& manager, phased_unbalanced_adjustment((this->samples*2)%8), unphased_unbalanced_adjustment(this->samples%4) { - if(this->parameters.force == TomahawkCalcParameters::force_method::none) + // Decide block comparator function + if(this->parameters.force == parameter_type::force_method::none) this->phase_function_across = &self_type::CompareBlocksFunction; - else if(this->parameters.force == TomahawkCalcParameters::force_method::phasedFunction) + else if(this->parameters.force == parameter_type::force_method::phasedFunction) this->phase_function_across = &self_type::CompareBlocksFunctionForcedPhased; else 
this->phase_function_across = &self_type::CompareBlocksFunctionForcedUnphased; } template -TomahawkCalculateSlave::~TomahawkCalculateSlave(){ } +LDSlave::~LDSlave(){ } // Reduce function template -TomahawkCalculateSlave& TomahawkCalculateSlave::operator+=(const TomahawkCalculateSlave& other){ - this->block_comparisons += other.block_comparisons; +LDSlave& LDSlave::operator+=(const LDSlave& other){ + this->block_comparisons += other.block_comparisons; this->variant_comparisons += other.variant_comparisons; - this->impossible += other.impossible; - this->possible += other.possible; - this->no_uncertainty += other.no_uncertainty; + this->impossible += other.impossible; + this->possible += other.possible; + this->no_uncertainty += other.no_uncertainty; this->insufficent_alleles += other.insufficent_alleles; - //this->false_positive += other.false_positive; - //this->false_negative += other.false_negative; - this->output_manager += other.output_manager; + //this->false_positive += other.false_positive; + //this->false_negative += other.false_negative; + this->output_writer += other.output_writer; return(*this); } template -void TomahawkCalculateSlave::setFLAGs(const controller_type& a, const controller_type& b){ +void LDSlave::setFLAGs(const block_type& block1, const block_type& block2){ // If long range - const meta_type& mA = a.currentMeta(); - const meta_type& mB = b.currentMeta(); + const meta_type& mA = block1.currentMeta(); + const meta_type& mB = block2.currentMeta(); - if(b.support->contigID == a.support->contigID) + if(block1.getTotempole().contigID == block2.getTotempole().contigID) this->helper.setSameContig(); if((mB.position >> 2) - (mA.position >> 2) > LONG_RANGE_THRESHOLD) @@ -443,15 +457,17 @@ void TomahawkCalculateSlave::setFLAGs(const controller_type& a, const control } template -bool TomahawkCalculateSlave::CalculateLDUnphased(const controller_type& a, const controller_type& b){ - if(a.meta[a.metaPointer].MAF == 0 || b.meta[b.metaPointer].MAF == 0) 
+bool LDSlave::CalculateLDUnphased(const block_type& block1, const block_type& block2){ + if(block1.currentMeta().MAF == 0 || block2.currentMeta().MAF == 0) return false; this->helper.resetUnphased(); - ///////////// + /*//////////// // Calculate - ///////////// + ////////////*/ + const run_type* const a = block1.current(); + const run_type* const b = block2.current(); T currentLengthA = a[0].runs; T currentLengthB = b[0].runs; @@ -493,10 +509,10 @@ bool TomahawkCalculateSlave::CalculateLDUnphased(const controller_type& a, co } // Exit condition - if(pointerA == a.meta[a.metaPointer].runs || pointerB == b.meta[b.metaPointer].runs){ + if(pointerA == block1.currentMeta().runs || pointerB == block2.currentMeta().runs){ //std::cerr << pointerA << '/' << a.meta[a.metaPointer].runs << '\t' << pointerB << '/' << b.meta[b.metaPointer].runs << std::endl; - if(pointerA != a.meta[a.metaPointer].runs || pointerB != b.meta[b.metaPointer].runs){ - std::cerr << Tomahawk::Helpers::timestamp("FATAL") << "Failed to exit equally!\n" << pointerA << "/" << a.meta[a.metaPointer].runs << " and " << pointerB << "/" << b.meta[b.metaPointer].runs << std::endl; + if(pointerA != block1.currentMeta().runs || pointerB != block2.currentMeta().runs){ + std::cerr << Tomahawk::Helpers::timestamp("FATAL") << "Failed to exit equally!\n" << pointerA << "/" << block1.currentMeta().runs << " and " << pointerB << "/" << block2.currentMeta().runs << std::endl; exit(1); } break; @@ -511,16 +527,16 @@ bool TomahawkCalculateSlave::CalculateLDUnphased(const controller_type& a, co #if SLAVE_DEBUG_MODE == 4 || SLAVE_DEBUG_MODE == 5 auto t1 = std::chrono::high_resolution_clock::now(); auto ticks_per_iter = Cycle(t1-t0); - std::cout << a.currentMeta().MAF*this->samples + b.currentMeta().MAF*this->samples << '\t' << ticks_per_iter.count() << '\t'; + std::cout << a.getMeta().MAF*this->samples + b.getMeta().MAF*this->samples << '\t' << ticks_per_iter.count() << '\t'; #endif - this->setFLAGs(a, b); + 
this->setFLAGs(block1, block2); return(this->CalculateLDUnphasedMath()); } template -double TomahawkCalculateSlave::EstimateChiSq(const double& target, const double& p, const double& q) const{ +double LDSlave::EstimateChiSq(const double& target, const double& p, const double& q) const{ const double f12 = p - target; const double f21 = q - target; const double f22 = 1 - (target + f12 + f21); @@ -543,11 +559,11 @@ double TomahawkCalculateSlave::EstimateChiSq(const double& target, const doub chisq2212 = e2212 > 0 ? pow(this->helper[81] + this->helper[84] - e2212, 2) / e2212 : 0, chisq2222 = e2222 > 0 ? pow(this->helper[85] - e2222, 2) / e2222 : 0; - return(chisq1111+chisq1112+chisq1122+chisq1211+chisq1212+chisq1222+chisq2211+chisq2212+chisq2222); + return(chisq1111 + chisq1112 + chisq1122+chisq1211+chisq1212+chisq1222+chisq2211+chisq2212+chisq2222); } template -bool TomahawkCalculateSlave::ChooseF11Calculate(const double& target, const double& p, const double& q){ +bool LDSlave::ChooseF11Calculate(const double& target, const double& p, const double& q){ this->helper.haplotypeCounts[0] = target; this->helper.haplotypeCounts[1] = p - this->helper.haplotypeCounts[0]; this->helper.haplotypeCounts[2] = q - this->helper.haplotypeCounts[0]; @@ -559,8 +575,9 @@ bool TomahawkCalculateSlave::ChooseF11Calculate(const double& target, const d this->helper[4] = this->helper.haplotypeCounts[2] * 2*this->helper.totalAlleleCounts; this->helper[5] = this->helper.haplotypeCounts[3] * 2*this->helper.totalAlleleCounts; - this->helper.D = this->helper.haplotypeCounts[0] * this->helper.haplotypeCounts[3] - this->helper.haplotypeCounts[1] * this->helper.haplotypeCounts[2]; + this->helper.D = this->helper.haplotypeCounts[0] * this->helper.haplotypeCounts[3] - this->helper.haplotypeCounts[1] * this->helper.haplotypeCounts[2]; this->helper.R2 = this->helper.D*this->helper.D / (p * (1 - p) * q * (1 - q)); + this->helper.R = this->helper.D / sqrt(p * (1 - p) * q * (1 - q)); 
if(this->helper.countAlternatives() < this->parameters.minimum_alleles) return false; @@ -577,7 +594,7 @@ bool TomahawkCalculateSlave::ChooseF11Calculate(const double& target, const d } this->helper.Dprime = this->helper.D / this->helper.Dmax; - if(this->helper.D < 0) + if(this->helper.D < 0) this->helper.P = this->fisherController.fisherTestLess(round(this->helper[0]),round(this->helper[1]),round(this->helper[4]),round(this->helper[5])); else this->helper.P = this->fisherController.fisherTestGreater(round(this->helper[0]),round(this->helper[1]),round(this->helper[4]),round(this->helper[5])); @@ -597,7 +614,7 @@ bool TomahawkCalculateSlave::ChooseF11Calculate(const double& target, const d } template -bool TomahawkCalculateSlave::CalculateLDUnphasedMath(void){ +bool LDSlave::CalculateLDUnphasedMath(void){ // Total amount of non-missing alleles this->helper.totalAlleleCounts = this->helper[0] + this->helper[1] + this->helper[4] + this->helper[5] + this->helper[16] + this->helper[17] + this->helper[20] + this->helper[21] @@ -651,10 +668,10 @@ bool TomahawkCalculateSlave::CalculateLDUnphasedMath(void){ //const double n21 = (2.0*this->helper[80] + this->helper[81] + this->helper[84] + this->helper[16] + this->helper[64]); //const double n22 = (2.0*this->helper[85] + this->helper[81] + this->helper[84] + this->helper[21] + this->helper[85]); - ///////////////////////// + /*//////////////////////// // Cubic function: a3x^3 + a2x^2 + a1x + d = 0 <==> ax^3 + bx^2 + cx + d = 0 // Cubic constants - ///////////////////////// + ////////////////////////*/ const double G = 1.0 - 2.0*p - 2.0*q; const double dee = -n11*p*q; const double c = -n11*G - number_of_hets*(1.0 - p - q) + 2.0*this->helper.totalAlleleCounts*p*q; @@ -779,7 +796,7 @@ bool TomahawkCalculateSlave::CalculateLDUnphasedMath(void){ } template -bool TomahawkCalculateSlave::CalculateLDUnphasedVectorizedNoMissing(const controller_type& a, const controller_type& b){ +bool 
LDSlave::CalculateLDUnphasedVectorizedNoMissing(const block_type& block1, const block_type& block2){ this->helper.resetUnphased(); this->helper_simd.counters[0] = 0; @@ -793,17 +810,17 @@ bool TomahawkCalculateSlave::CalculateLDUnphasedVectorizedNoMissing(const con this->helper_simd.counters[8] = 0; this->helper_simd.counters[9] = 0; - const simd_pair& datA = a.packed->getData(a.metaPointer); - const simd_pair& datB = b.packed->getData(b.metaPointer); + const simd_pair& datA = block1.currentBitvector(); + const simd_pair& datB = block2.currentBitvector(); const BYTE* const arrayA = datA.data; const BYTE* const arrayB = datB.data; #if SIMD_AVAILABLE == 1 const U32 frontSmallest = datA.frontZero < datB.frontZero ? datA.frontZero : datB.frontZero; - const U32 tailSmallest = datA.tailZero < datB.tailZero ? datA.tailZero : datB.tailZero; + const U32 tailSmallest = datA.tailZero < datB.tailZero ? datA.tailZero : datB.tailZero; U32 i = frontSmallest; - const U32 frontBonus = datA.frontZero != frontSmallest ? datA.frontZero : datB.frontZero; - const U32 tailBonus = (datA.tailZero != tailSmallest ? datA.tailZero : datB.tailZero); + const U32 frontBonus = datA.frontZero != frontSmallest ? datA.frontZero : datB.frontZero; + const U32 tailBonus = (datA.tailZero != tailSmallest ? 
datA.tailZero : datB.tailZero); const VECTOR_TYPE* const vectorA = (const VECTOR_TYPE* const)arrayA; const VECTOR_TYPE* const vectorB = (const VECTOR_TYPE* const)arrayB; @@ -902,15 +919,15 @@ bool TomahawkCalculateSlave::CalculateLDUnphasedVectorizedNoMissing(const con std::cout << ticks_per_iter.count() << '\n'; #endif - this->setFLAGs(a, b); + this->setFLAGs(block1, block2); return(this->CalculateLDUnphasedMath()); } template -bool TomahawkCalculateSlave::CalculateLDUnphasedVectorized(const controller_type& a, const controller_type& b){ +bool LDSlave::CalculateLDUnphasedVectorized(const block_type& block1, const block_type& block2){ #if SLAVE_DEBUG_MODE < 6 - if(a.currentMeta().missing == 0 && b.currentMeta().missing == 0) - return(this->CalculateLDUnphasedVectorizedNoMissing(a, b)); + if(block1.currentMeta().missing == 0 && block2.currentMeta().missing == 0) + return(this->CalculateLDUnphasedVectorizedNoMissing(block1, block2)); #endif this->helper.resetUnphased(); @@ -926,24 +943,24 @@ bool TomahawkCalculateSlave::CalculateLDUnphasedVectorized(const controller_t this->helper_simd.counters[8] = 0; this->helper_simd.counters[9] = 0; - const simd_pair& datA = a.packed->getData(a.metaPointer); - const simd_pair& datB = b.packed->getData(b.metaPointer); - const BYTE* const arrayA = datA.data; - const BYTE* const arrayB = datB.data; + const simd_pair& datA = block1.currentBitvector(); + const simd_pair& datB = block2.currentBitvector(); + const BYTE* const arrayA = datA.data; + const BYTE* const arrayB = datB.data; const BYTE* const arrayA_mask = datA.mask; const BYTE* const arrayB_mask = datB.mask; #if SIMD_AVAILABLE == 1 const U32 frontSmallest = datA.frontZero < datB.frontZero ? datA.frontZero : datB.frontZero; - const U32 tailSmallest = datA.tailZero < datB.tailZero ? datA.tailZero : datB.tailZero; + const U32 tailSmallest = datA.tailZero < datB.tailZero ? 
datA.tailZero : datB.tailZero; U32 i = frontSmallest; - const U32 frontBonus = datA.frontZero != frontSmallest ? datA.frontZero : datB.frontZero; - const U32 tailBonus = (datA.tailZero != tailSmallest ? datA.tailZero : datB.tailZero); + const U32 frontBonus = datA.frontZero != frontSmallest ? datA.frontZero : datB.frontZero; + const U32 tailBonus = (datA.tailZero != tailSmallest ? datA.tailZero : datB.tailZero); //std::cerr << frontSmallest << '\t' << tailSmallest << std::endl; - const VECTOR_TYPE* const vectorA = (const VECTOR_TYPE* const)arrayA; - const VECTOR_TYPE* const vectorB = (const VECTOR_TYPE* const)arrayB; + const VECTOR_TYPE* const vectorA = (const VECTOR_TYPE* const)arrayA; + const VECTOR_TYPE* const vectorB = (const VECTOR_TYPE* const)arrayB; const VECTOR_TYPE* const vectorA_mask = (const VECTOR_TYPE* const)arrayA_mask; const VECTOR_TYPE* const vectorB_mask = (const VECTOR_TYPE* const)arrayB_mask; VECTOR_TYPE altalt, refref, altref, refalt; @@ -1044,15 +1061,15 @@ bool TomahawkCalculateSlave::CalculateLDUnphasedVectorized(const controller_t std::cout << ticks_per_iter.count() << '\n'; #endif - this->setFLAGs(a, b); + this->setFLAGs(block1, block2); return(this->CalculateLDUnphasedMath()); } template -bool TomahawkCalculateSlave::CalculateLDPhasedVectorized(const controller_type& a, const controller_type& b){ +bool LDSlave::CalculateLDPhasedVectorized(const block_type& block1, const block_type& block2){ #if SLAVE_DEBUG_MODE < 6 - if(a.currentMeta().missing == 0 && b.currentMeta().missing == 0) - return(this->CalculateLDPhasedVectorizedNoMissing(a, b)); + if(block1.currentMeta().missing == 0 && block2.currentMeta().missing == 0) + return(this->CalculateLDPhasedVectorizedNoMissing(block1, block2)); #endif this->helper.resetPhased(); @@ -1061,22 +1078,22 @@ bool TomahawkCalculateSlave::CalculateLDPhasedVectorized(const controller_typ this->helper_simd.counters[2] = 0; this->helper_simd.counters[3] = 0; - const simd_pair& datA = 
a.packed->getData(a.metaPointer); - const simd_pair& datB = b.packed->getData(b.metaPointer); - const BYTE* const arrayA = datA.data; - const BYTE* const arrayB = datB.data; + const simd_pair& datA = block1.currentBitvector(); + const simd_pair& datB = block2.currentBitvector(); + const BYTE* const arrayA = datA.data; + const BYTE* const arrayB = datB.data; const BYTE* const arrayA_mask = datA.mask; const BYTE* const arrayB_mask = datB.mask; #if SIMD_AVAILABLE == 1 const U32 frontSmallest = datA.frontZero < datB.frontZero ? datA.frontZero : datB.frontZero; - const U32 tailSmallest = datA.tailZero < datB.tailZero ? datA.tailZero : datB.tailZero; + const U32 tailSmallest = datA.tailZero < datB.tailZero ? datA.tailZero : datB.tailZero; U32 i = frontSmallest; - const U32 frontBonus = datA.frontZero != frontSmallest ? datA.frontZero : datB.frontZero; - const U32 tailBonus = (datA.tailZero != tailSmallest ? datA.tailZero : datB.tailZero); + const U32 frontBonus = datA.frontZero != frontSmallest ? datA.frontZero : datB.frontZero; + const U32 tailBonus = (datA.tailZero != tailSmallest ? 
datA.tailZero : datB.tailZero); - const VECTOR_TYPE* const vectorA = (const VECTOR_TYPE* const)arrayA; - const VECTOR_TYPE* const vectorB = (const VECTOR_TYPE* const)arrayB; + const VECTOR_TYPE* const vectorA = (const VECTOR_TYPE* const)arrayA; + const VECTOR_TYPE* const vectorB = (const VECTOR_TYPE* const)arrayB; const VECTOR_TYPE* const vectorA_mask = (const VECTOR_TYPE* const)arrayA_mask; const VECTOR_TYPE* const vectorB_mask = (const VECTOR_TYPE* const)arrayB_mask; VECTOR_TYPE __intermediate, masks; @@ -1163,23 +1180,23 @@ bool TomahawkCalculateSlave::CalculateLDPhasedVectorized(const controller_typ #if SLAVE_DEBUG_MODE == 4 || SLAVE_DEBUG_MODE == 5 auto t1 = std::chrono::high_resolution_clock::now(); auto ticks_per_iter = Cycle(t1-t0); - std::cout << "V\t" << a.currentMeta().MAF*this->samples + b.currentMeta().MAF*this->samples << '\t' << ticks_per_iter.count() << '\n'; + std::cout << "V\t" << a.getMeta().MAF*this->samples + b.getMeta().MAF*this->samples << '\t' << ticks_per_iter.count() << '\n'; #endif - this->setFLAGs(a, b); + this->setFLAGs(block1, block2); return(this->CalculateLDPhasedMath()); } template -bool TomahawkCalculateSlave::CalculateLDPhasedVectorizedNoMissing(const controller_type& a, const controller_type& b){ +bool LDSlave::CalculateLDPhasedVectorizedNoMissing(const block_type& block1, const block_type& block2){ this->helper.resetPhased(); this->helper_simd.counters[0] = 0; this->helper_simd.counters[1] = 0; this->helper_simd.counters[2] = 0; this->helper_simd.counters[3] = 0; - const simd_pair& datA = a.packed->getData(a.metaPointer); - const simd_pair& datB = b.packed->getData(b.metaPointer); + const simd_pair& datA = block1.currentBitvector(); + const simd_pair& datB = block2.currentBitvector(); const BYTE* const arrayA = datA.data; const BYTE* const arrayB = datB.data; @@ -1187,8 +1204,8 @@ bool TomahawkCalculateSlave::CalculateLDPhasedVectorizedNoMissing(const contr const U32 frontSmallest = datA.frontZero < datB.frontZero ? 
datA.frontZero : datB.frontZero; const U32 tailSmallest = datA.tailZero < datB.tailZero ? datA.tailZero : datB.tailZero; U32 i = frontSmallest; - const U32 frontBonus = datA.frontZero != frontSmallest ? datA.frontZero : datB.frontZero; - const U32 tailBonus = datA.tailZero != tailSmallest ? datA.tailZero : datB.tailZero; + const U32 frontBonus = datA.frontZero != frontSmallest ? datA.frontZero : datB.frontZero; + const U32 tailBonus = datA.tailZero != tailSmallest ? datA.tailZero : datB.tailZero; const VECTOR_TYPE* const vectorA = (const VECTOR_TYPE* const)arrayA; const VECTOR_TYPE* const vectorB = (const VECTOR_TYPE* const)arrayB; @@ -1265,17 +1282,17 @@ bool TomahawkCalculateSlave::CalculateLDPhasedVectorizedNoMissing(const contr #if SLAVE_DEBUG_MODE == 4 || SLAVE_DEBUG_MODE == 5 auto t1 = std::chrono::high_resolution_clock::now(); auto ticks_per_iter = Cycle(t1-t0); - std::cout << "V\t" << a.currentMeta().MAF*this->samples + b.currentMeta().MAF*this->samples << "\t" << ticks_per_iter.count() << '\n'; + std::cout << "V\t" << a.getMeta().MAF*this->samples + b.getMeta().MAF*this->samples << "\t" << ticks_per_iter.count() << '\n'; #endif - this->setFLAGs(a, b); + this->setFLAGs(block1, block2); return(this->CalculateLDPhasedMath()); } template -bool TomahawkCalculateSlave::CalculateLDPhased(const controller_type& a, const controller_type& b){ - if(a.currentMeta().MAF == 0 || b.currentMeta().MAF == 0) +bool LDSlave::CalculateLDPhased(const block_type& block1, const block_type& block2){ + if(block1.currentMeta().MAF == 0 || block2.currentMeta().MAF == 0) return false; this->helper.resetPhased(); @@ -1284,6 +1301,8 @@ bool TomahawkCalculateSlave::CalculateLDPhased(const controller_type& a, cons auto t0 = std::chrono::high_resolution_clock::now(); #endif + const run_type* const a = block1.current(); + const run_type* const b = block2.current(); T currentLengthA = a[0].runs; T currentLengthB = b[0].runs; @@ -1322,9 +1341,9 @@ bool 
TomahawkCalculateSlave::CalculateLDPhased(const controller_type& a, cons this->helper[currentMixR] += add; // Exit condition - if(pointerA == a.meta[a.metaPointer].runs || pointerB == b.meta[b.metaPointer].runs){ - if(pointerA != a.meta[a.metaPointer].runs || pointerB != b.meta[b.metaPointer].runs){ - std::cerr << Tomahawk::Helpers::timestamp("FATAL") << "Failed to exit equally!\n" << pointerA << "/" << a.meta[a.metaPointer].runs << " and " << pointerB << "/" << b.meta[b.metaPointer].runs << std::endl; + if(pointerA == block1.currentMeta().runs || pointerB == block2.currentMeta().runs){ + if(pointerA != block1.currentMeta().runs || pointerB != block2.currentMeta().runs){ + std::cerr << Tomahawk::Helpers::timestamp("FATAL") << "Failed to exit equally!\n" << pointerA << "/" << block1.currentMeta().runs << " and " << pointerB << "/" << block2.currentMeta().runs << std::endl; exit(1); } break; @@ -1344,20 +1363,20 @@ bool TomahawkCalculateSlave::CalculateLDPhased(const controller_type& a, cons #if SLAVE_DEBUG_MODE == 4 || SLAVE_DEBUG_MODE == 5 auto t1 = std::chrono::high_resolution_clock::now(); auto ticks_per_iter = Cycle(t1-t0); - std::cout << a.currentMeta().MAF*this->samples + b.currentMeta().MAF*this->samples << '\t' << ticks_per_iter.count() << '\n'; + std::cout << a.getMeta().MAF*this->samples + b.getMeta().MAF*this->samples << '\t' << ticks_per_iter.count() << '\n'; #endif #if SLAVE_DEBUG_MODE == 3 - std::cout << a.currentMeta().runs << '\t' << b.currentMeta().runs << '\t' << iterations << std::endl; + std::cout << a.getMeta().runs << '\t' << b.getMeta().runs << '\t' << iterations << std::endl; #endif - this->setFLAGs(a, b); + this->setFLAGs(block1, block2); return(this->CalculateLDPhasedMath()); } template -bool TomahawkCalculateSlave::CalculateLDPhasedMath(void){ +bool LDSlave::CalculateLDPhasedMath(void){ // Trigger phased flag this->helper.setPhased(); @@ -1382,8 +1401,9 @@ bool TomahawkCalculateSlave::CalculateLDPhasedMath(void){ 
this->helper.haplotypeCounts[2] = (this->helper[0] + this->helper[4]) / this->helper.totalAlleleCounts; this->helper.haplotypeCounts[3] = (this->helper[1] + this->helper[5]) / this->helper.totalAlleleCounts; - this->helper.D = this->helper[0]/this->helper.totalAlleleCounts * this->helper[5]/this->helper.totalAlleleCounts - this->helper[1]/this->helper.totalAlleleCounts * this->helper[4]/this->helper.totalAlleleCounts; + this->helper.D = this->helper[0]/this->helper.totalAlleleCounts * this->helper[5]/this->helper.totalAlleleCounts - this->helper[1]/this->helper.totalAlleleCounts * this->helper[4]/this->helper.totalAlleleCounts; this->helper.R2 = this->helper.D*this->helper.D / (((this->helper.haplotypeCounts[0] > 0 ? this->helper.haplotypeCounts[0] : 1) * (this->helper.haplotypeCounts[1] > 0 ? this->helper.haplotypeCounts[1] : 1) * (this->helper.haplotypeCounts[2] > 0 ? this->helper.haplotypeCounts[2] : 1) * (this->helper.haplotypeCounts[3] > 0 ? this->helper.haplotypeCounts[3] : 1))); + this->helper.R = this->helper.D / sqrt((((this->helper.haplotypeCounts[0] > 0 ? this->helper.haplotypeCounts[0] : 1) * (this->helper.haplotypeCounts[1] > 0 ? this->helper.haplotypeCounts[1] : 1) * (this->helper.haplotypeCounts[2] > 0 ? this->helper.haplotypeCounts[2] : 1) * (this->helper.haplotypeCounts[3] > 0 ? 
this->helper.haplotypeCounts[3] : 1)))); if(this->helper.R2 >= this->parameters.R2_min && this->helper.R2 <= this->parameters.R2_max){ if(this->helper.D >= 0){ @@ -1426,16 +1446,16 @@ bool TomahawkCalculateSlave::CalculateLDPhasedMath(void){ // Execute diagonal working order template -bool TomahawkCalculateSlave::DiagonalWorkOrder(const order_type& order){ +bool LDSlave::DiagonalWorkOrder(const order_type& order){ for(U32 i = order.fromRow; i < order.toRow; ++i){ - controller_type block1(this->manager[i]); + block_type block1(this->manager[i]); for(U32 j = i; j < order.toColumn; ++j){ //std::cerr << Helpers::timestamp("DEBUG", "DIAG") << i << '/' << j << '\t' << order << std::endl; if(i == j) this->CompareBlocks(block1); else { - controller_type block2(this->manager[j]); + block_type block2(this->manager[j]); this->CompareBlocks(block1, block2); } } @@ -1445,19 +1465,19 @@ bool TomahawkCalculateSlave::DiagonalWorkOrder(const order_type& order){ // Execute square working order template -bool TomahawkCalculateSlave::SquareWorkOrder(const order_type& order){ +bool LDSlave::SquareWorkOrder(const order_type& order){ if(order.staggered) return(this->DiagonalWorkOrder(order)); for(U32 i = order.fromRow; i < order.toRow; ++i){ - controller_type block1(this->manager[i]); + block_type block1(this->manager[i]); for(U32 j = order.fromColumn; j < order.toColumn; ++j){ //std::cerr << Helpers::timestamp("DEBUG", "SQUARE") << i << '/' << j << '\t' << order << std::endl; if(i == j) this->CompareBlocks(block1); else { - controller_type block2(this->manager[j]); + block_type block2(this->manager[j]); this->CompareBlocks(block1, block2); } } @@ -1466,7 +1486,7 @@ bool TomahawkCalculateSlave::SquareWorkOrder(const order_type& order){ } template -bool TomahawkCalculateSlave::Calculate(void){ +bool LDSlave::Calculate(void){ // If there is no data if(this->manager.size() == 0){ std::cerr << Helpers::timestamp("ERROR", "CONTROLLER") << "There is no data..." 
<< std::endl; @@ -1487,13 +1507,13 @@ bool TomahawkCalculateSlave::Calculate(void){ } // Finish the output manager - this->output_manager.flushBlock(); + this->output_writer.flush(); return true; } template -void TomahawkCalculateSlave::CompareBlocksFunction(const controller_type& block1, const controller_type block2){ +void LDSlave::CompareBlocksFunction(const block_type& block1, const block_type& block2){ #if SLAVE_DEBUG_MODE == 1 // 1 = No debug mode // Ignore when one or both is invariant if(block1.currentMeta().MAF == 0 || block2.currentMeta().MAF == 0 || block1.currentMeta().runs == 1 || block2.currentMeta().runs == 1){ @@ -1503,25 +1523,33 @@ void TomahawkCalculateSlave::CompareBlocksFunction(const controller_type& blo if(block1.currentMeta().phased == 1 && block2.currentMeta().phased == 1){ if(block1.currentMeta().MAF+block2.currentMeta().MAF <= 0.004792332){ - if(this->CalculateLDPhased(block1, block2)) - this->output_manager.Add(block1, block2, this->helper); + if(this->CalculateLDPhased(block1, block2)){ + this->output_writer.Add(block1.currentMeta(), block2.currentMeta(), block1.getTotempole(), block2.getTotempole(), this->helper); + //std::cerr << this->helper.R2 << '\n'; + } } else { - if(this->CalculateLDPhasedVectorized(block1, block2)) - this->output_manager.Add(block1, block2, this->helper); + if(this->CalculateLDPhasedVectorized(block1, block2)){ + this->output_writer.Add(block1.currentMeta(), block2.currentMeta(), block1.getTotempole(), block2.getTotempole(), this->helper); + //std::cerr << this->helper.R2 << '\n'; + } } } else { if(block1.currentMeta().MAF+block2.currentMeta().MAF <= 0.009784345){ - if(this->CalculateLDUnphased(block1, block2)) - this->output_manager.Add(block1, block2, this->helper); + if(this->CalculateLDUnphased(block1, block2)){ + this->output_writer.Add(block1.currentMeta(), block2.currentMeta(), block1.getTotempole(), block2.getTotempole(), this->helper); + //std::cerr << this->helper.R2 << '\n'; + } } else { - 
if(this->CalculateLDUnphasedVectorized(block1, block2)) - this->output_manager.Add(block1, block2, this->helper); + if(this->CalculateLDUnphasedVectorized(block1, block2)){ + this->output_writer.Add(block1.currentMeta(), block2.currentMeta(), block1.getTotempole(), block2.getTotempole(), this->helper); + //std::cerr << this->helper.R2 << '\n'; + } } } #elif SLAVE_DEBUG_MODE == 2 if(this->CalculateLDPhasedVectorizedNoMissing(block1, block2)){ - this->output_manager.Add(block1, block2, this->helper); + this->output_writer.Add(block1, block2, this->helper); } #elif SLAVE_DEBUG_MODE == 3 this->CalculateLDPhased(block1, block2); @@ -1559,58 +1587,70 @@ void TomahawkCalculateSlave::CompareBlocksFunction(const controller_type& blo } template -void TomahawkCalculateSlave::CompareBlocksFunctionForcedPhased(const controller_type& block1, const controller_type block2){ +void LDSlave::CompareBlocksFunctionForcedPhased(const block_type& block1, const block_type& block2){ // Ignore when one or both is invariant - if(block1.currentMeta().MAF == 0 || block2.currentMeta().MAF == 0 || block1.currentMeta().runs == 1 || block2.currentMeta().runs == 1){ + if(block1.currentMeta().MAF == 0 || block2.currentMeta().MAF == 0 || + block1.currentMeta().runs == 1 || block2.currentMeta().runs == 1) + { //std::cerr << "invariant" << std::endl; return; } if(block1.currentMeta().MAF+block2.currentMeta().MAF <= 0.004792332){ - if(this->CalculateLDPhased(block1, block2)) - this->output_manager.Add(block1, block2, this->helper); + if(this->CalculateLDPhased(block1, block2)){ + this->output_writer.Add(block1.currentMeta(), block2.currentMeta(), block1.getTotempole(), block2.getTotempole(), this->helper); + //std::cerr << this->helper.R2 << '\n'; + } } else { - if(this->CalculateLDPhasedVectorized(block1, block2)) - this->output_manager.Add(block1, block2, this->helper); + if(this->CalculateLDPhasedVectorized(block1, block2)){ + this->output_writer.Add(block1.currentMeta(), block2.currentMeta(), 
block1.getTotempole(), block2.getTotempole(), this->helper); + //std::cerr << this->helper.R2 << '\n'; + } } } template -void TomahawkCalculateSlave::CompareBlocksFunctionForcedUnphased(const controller_type& block1, const controller_type block2){ +void LDSlave::CompareBlocksFunctionForcedUnphased(const block_type& block1, const block_type& block2){ // Ignore when one or both is invariant - if(block1.currentMeta().MAF == 0 || block2.currentMeta().MAF == 0 || block1.currentMeta().runs == 1 || block2.currentMeta().runs == 1){ + if(block1.currentMeta().MAF == 0 || block2.currentMeta().MAF == 0 || + block1.currentMeta().runs == 1 || block2.currentMeta().runs == 1) + { //std::cerr << "invariant" << std::endl; return; } if(block1.currentMeta().MAF+block2.currentMeta().MAF <= 0.009784345){ - if(this->CalculateLDUnphased(block1, block2)) - this->output_manager.Add(block1, block2, this->helper); + if(this->CalculateLDUnphased(block1, block2)){ + this->output_writer.Add(block1.currentMeta(), block2.currentMeta(), block1.getTotempole(), block2.getTotempole(), this->helper); + //std::cerr << this->helper.R2 << '\n'; + } } else { - if(this->CalculateLDUnphasedVectorized(block1, block2)) - this->output_manager.Add(block1, block2, this->helper); + if(this->CalculateLDUnphasedVectorized(block1, block2)){ + this->output_writer.Add(block1.currentMeta(), block2.currentMeta(), block1.getTotempole(), block2.getTotempole(), this->helper); + //std::cerr << this->helper.R2 << '\n'; + } } } // Within-block comparisons template -bool TomahawkCalculateSlave::CompareBlocks(controller_type& block1){ +bool LDSlave::CompareBlocks(block_type& block1){ //std::cerr << Helpers::timestamp("DEBUG", "DIAG-INTERNAL") << *block1.support << '\t' << (block1.size()*block1.size()-block1.size())/2 << std::endl; - block1.reset(); // make sure it is reset - controller_type block2(block1); + block1.resetIterator(); // make sure it is reset + block_type block2(block1); for(U32 i = 0; i < block1.size(); ++i){ 
block2 = block1; ++block2; // block2 starts at relative +1 - for(U32 j = i + 1; j < block2.size(); ++j){ + for(U32 j = i + 1; j < block1.size(); ++j){ (this->*phase_function_across)(block1, block2); ++block2; } // Update progress - this->progress(block1.size() - (i + 1), this->output_manager.GetProgressCounts()); - this->output_manager.ResetProgress(); + this->progress(block1.size() - (i + 1), this->output_writer.getProgressCounts()); + this->output_writer.ResetProgress(); ++block1; } return true; @@ -1618,11 +1658,11 @@ bool TomahawkCalculateSlave::CompareBlocks(controller_type& block1){ // Across block comparisons template -bool TomahawkCalculateSlave::CompareBlocks(controller_type& block1, controller_type block2){ +bool LDSlave::CompareBlocks(block_type& block1, block_type& block2){ // Reset // Make sure pointers are the beginning - block1.reset(); - block2.reset(); + block1.resetIterator(); + block2.resetIterator(); // Cycle over block 1 and block 2 for(U32 i = 0; i < block1.size(); ++i){ @@ -1632,11 +1672,11 @@ bool TomahawkCalculateSlave::CompareBlocks(controller_type& block1, controlle } // Update progress - this->progress(block2.size(), this->output_manager.GetProgressCounts()); - this->output_manager.ResetProgress(); + this->progress(block2.size(), this->output_writer.getProgressCounts()); + this->output_writer.ResetProgress(); // Reset position in block2 and increment position in block1 - block2.reset(); + block2.resetIterator(); ++block1; } return true; @@ -1644,4 +1684,4 @@ bool TomahawkCalculateSlave::CompareBlocks(controller_type& block1, controlle } -#endif /* TOMAHAWK_TOMAHAWKCALCULATESLAVE_H_ */ +#endif /* TOMAHAWK_LD_CALCULATION_SLAVE_H_ */ diff --git a/src/tomahawk/base/TomahawkEntryMeta.h b/src/tomahawk/meta_entry.h similarity index 57% rename from src/tomahawk/base/TomahawkEntryMeta.h rename to src/tomahawk/meta_entry.h index 9fc0314..a486a48 100644 --- a/src/tomahawk/base/TomahawkEntryMeta.h +++ b/src/tomahawk/meta_entry.h @@ -3,7 +3,7 @@ 
namespace Tomahawk{ // Size of meta entry BEFORE run entries -#define TOMAHAWK_ENTRY_META_SIZE (sizeof(U32) + sizeof(BYTE) + 2*sizeof(float)) +#define TOMAHAWK_ENTRY_META_SIZE (sizeof(U32) + sizeof(BYTE) + 2*sizeof(float)) /* TomahawkEntryMetaBase is used for reinterpreting @@ -11,12 +11,12 @@ namespace Tomahawk{ Number of runs can be inferred from the sample number and byte length of the stream */ -#pragma pack(1) -struct TomahawkEntryMetaBase{ - typedef TomahawkEntryMetaBase self_type; +#pragma pack(push, 1) +struct __attribute__((packed, aligned(1))) MetaEntryBase{ + typedef MetaEntryBase self_type; public: - TomahawkEntryMetaBase() : + MetaEntryBase() : missing(0), phased(0), position(0), @@ -24,20 +24,26 @@ struct TomahawkEntryMetaBase{ MAF(0), HWE_P(0) {} - ~TomahawkEntryMetaBase(){} - bool isSingleton(void) const{ return(this->MAF == 0); } + MetaEntryBase(const char* const buffer_stream){ memcpy(this, buffer_stream, sizeof(self_type)); } + ~MetaEntryBase() = default; + inline bool isSingleton(void) const{ return(this->MAF == 0); } + inline const char getRefAllele(void) const{ return(Constants::REF_ALT_LOOKUP[this->ref_alt >> 4]); } + inline const char getAltAllele(void) const{ return(Constants::REF_ALT_LOOKUP[this->ref_alt & ((1 << 4) - 1)]); } + inline const char getPhaseVCFCharacter(void) const{ return(this->phased == 1 ? '|' : '/'); } + + // Overloaded operator for debug use friend std::ostream& operator<<(std::ostream& out, const self_type& entry){ out << entry.position << '\t' << (int)entry.ref_alt << '\t' << entry.MAF << '\t' << entry.HWE_P; return(out); } public: - const U32 missing: 1, phased: 1, position: 30; - const BYTE ref_alt; - const float MAF; - const float HWE_P; + U32 missing: 1, phased: 1, position: 30; + BYTE ref_alt; + float MAF; + float HWE_P; }; /* @@ -45,16 +51,25 @@ struct TomahawkEntryMetaBase{ regaring a variant line such as position, if any genotypes are missing and if the data is phased. 
*/ -#pragma pack(1) template -struct TomahawkEntryMeta : public TomahawkEntryMetaBase{ - typedef TomahawkEntryMeta self_type; +struct __attribute__((packed, aligned(1))) MetaEntry : public MetaEntryBase{ + typedef MetaEntry self_type; + typedef MetaEntryBase parent_type; public: - TomahawkEntryMeta() : + MetaEntry() : runs(0) {} - ~TomahawkEntryMeta(){} + + // Copy from stream + MetaEntry(const char* const buffer_stream) : + parent_type(buffer_stream), + runs(*reinterpret_cast(&buffer_stream[TOMAHAWK_ENTRY_META_SIZE])) + { + //std::cerr << "runs: " << (int)this->runs << std::endl; + } + + ~MetaEntry(){} inline bool isValid(void) const{ return(this->runs > 0); } @@ -76,6 +91,7 @@ struct TomahawkEntryMeta : public TomahawkEntryMetaBase{ public: T runs; // number of runs }; +#pragma pack(pop) } diff --git a/src/tomahawk/output_container.h b/src/tomahawk/output_container.h new file mode 100644 index 0000000..b6af7f2 --- /dev/null +++ b/src/tomahawk/output_container.h @@ -0,0 +1,188 @@ +#ifndef TOMAHAWK_BASE_OUTPUT_CONTAINER_H_ +#define TOMAHAWK_BASE_OUTPUT_CONTAINER_H_ + +#include + +#include "two/output_entry.h" + +namespace Tomahawk{ + +class OutputContainer{ +private: + typedef IO::OutputEntry value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + typedef IO::BasicBuffer buffer_type; + +public: + OutputContainer() : + n_entries(0), + n_capacity(0), + __entries(nullptr) + { + + } + + OutputContainer(const size_t capacity) : + n_entries(0), + n_capacity(capacity), + __entries(static_cast(::operator new[](this->capacity()*sizeof(value_type)))) + { + + } + + OutputContainer(char* const data, const U64 l_data) : + n_entries(l_data / sizeof(value_type)), + n_capacity(n_entries), + __entries(static_cast(::operator new[](this->size()*sizeof(value_type)))) + { + assert(l_data % 
sizeof(value_type) == 0); + + U32 cumulative_position = 0; + for(size_t i = 0; i < this->size(); ++i){ + new( &this->__entries[i] ) value_type( &data[cumulative_position] ); + cumulative_position += sizeof(value_type); + } + assert(cumulative_position == l_data); + } + + OutputContainer(const buffer_type& data_buffer) : + n_entries(data_buffer.size() / sizeof(value_type)), + n_capacity(n_entries), + __entries(static_cast(::operator new[](this->size()*sizeof(value_type)))) + { + assert(data_buffer.size() % sizeof(value_type) == 0); + + U32 cumulative_position = 0; + for(size_t i = 0; i < this->size(); ++i){ + new( &this->__entries[i] ) value_type( &data_buffer[cumulative_position] ); + cumulative_position += sizeof(value_type); + } + assert(cumulative_position == data_buffer.size()); + } + + ~OutputContainer(void){ + for(size_type i = 0; i < this->size(); ++i) + ((this->__entries + i)->~OutputEntry)(); + + ::operator delete[](static_cast(this->__entries)); + } + + class iterator{ + private: + typedef iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + reference operator*() const{ return *ptr_; } + pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + class const_iterator{ + private: + typedef const_iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + const_iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + const_reference operator*() const{ return *ptr_; } + const_pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + 
private: + pointer ptr_; + }; + + // Element access + inline reference at(const size_type& position){ return(this->__entries[position]); } + inline const_reference at(const size_type& position) const{ return(this->__entries[position]); } + inline reference operator[](const size_type& position){ return(this->__entries[position]); } + inline const_reference operator[](const size_type& position) const{ return(this->__entries[position]); } + inline pointer data(void){ return(this->__entries); } + inline const_pointer data(void) const{ return(this->__entries); } + inline reference front(void){ return(this->__entries[0]); } + inline const_reference front(void) const{ return(this->__entries[0]); } + inline reference back(void){ return(this->__entries[this->n_entries - 1]); } + inline const_reference back(void) const{ return(this->__entries[this->n_entries - 1]); } + + // Capacity + inline const bool empty(void) const{ return(this->n_entries == 0); } + inline const size_type& size(void) const{ return(this->n_entries); } + inline const size_type& capacity(void) const{ return(this->n_capacity); } + + // Iterator + inline iterator begin(){ return iterator(&this->__entries[0]); } + inline iterator end() { return iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator begin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator end() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator cbegin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator cend() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + + // Add + bool operator+=(const_reference entry){ + if(this->size() + 1 > this->capacity()){ + std::cerr << "could not fit!" 
<< std::endl; + return false; + } + + new( &this->__entries[this->size()] ) value_type( entry ); // invoke copy ctor + ++this->n_entries; + return true; + } + + inline bool addData(const buffer_type& buffer){ return(this->addData(buffer.data(), buffer.size())); } + bool addData(const char* const data, const U64 l_data){ + assert(l_data % sizeof(value_type) == 0); + const size_t entries_adding = l_data / sizeof(value_type); + + // Check + if(entries_adding + this->size() > this->capacity()){ + std::cerr << "could not fit!" << std::endl; + return false; + } + + U32 cumulative_position = 0; + size_t start_position = this->size(); + for(size_t i = 0; i < entries_adding; ++i){ + new( &this->__entries[start_position + i] ) value_type( &data[cumulative_position] ); + cumulative_position += sizeof(value_type); + } + assert(cumulative_position == l_data); + + return true; + } + + bool addEntry(const char* const data){ + // Check + if(this->size() + 1 > this->capacity()){ + std::cerr << "could not fit!" 
<< std::endl; + return false; + } + + new ( &this->__entries[this->size()] ) value_type( data ); + + return true; + } + +protected: + size_type n_entries; + size_type n_capacity; + pointer __entries; +}; + +} + +#endif /* TOMAHAWK_BASE_OUTPUT_CONTAINER_H_ */ diff --git a/src/tomahawk/output_container_reference.h b/src/tomahawk/output_container_reference.h new file mode 100644 index 0000000..47fa776 --- /dev/null +++ b/src/tomahawk/output_container_reference.h @@ -0,0 +1,112 @@ +#ifndef TOMAHAWK_BASE_OUTPUT_CONTAINER_REFERENCE_H_ +#define TOMAHAWK_BASE_OUTPUT_CONTAINER_REFERENCE_H_ + +#include + +#include "two/output_entry.h" + +namespace Tomahawk{ + +class OutputContainerReference{ +private: + typedef IO::OutputEntry value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + typedef IO::BasicBuffer buffer_type; + +public: + OutputContainerReference() : + n_entries(0), + __entries(nullptr) + { + + } + + OutputContainerReference(char* const data, const U64 l_data) : + n_entries(l_data / sizeof(value_type)), + __entries(reinterpret_cast(data)) + { + assert(n_entries > 0); + assert(l_data % sizeof(value_type) == 0); + } + + OutputContainerReference(const buffer_type& data_buffer) : + n_entries(data_buffer.size() / sizeof(value_type)), + __entries(reinterpret_cast(data_buffer.buffer)) + { + assert(n_entries >= 0); + assert(data_buffer.size() % sizeof(value_type) == 0); + } + + ~OutputContainerReference(){} + + class iterator{ + private: + typedef iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + reference operator*() const{ return *ptr_; } + pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ 
== rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + class const_iterator{ + private: + typedef const_iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + const_iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + const_reference operator*() const{ return *ptr_; } + const_pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + // Element access + inline reference at(const size_type& position){ return(this->__entries[position]); } + inline const_reference at(const size_type& position) const{ return(this->__entries[position]); } + inline reference operator[](const size_type& position){ return(this->__entries[position]); } + inline const_reference operator[](const size_type& position) const{ return(this->__entries[position]); } + inline pointer data(void){ return(this->__entries); } + inline const_pointer data(void) const{ return(this->__entries); } + inline reference front(void){ return(this->__entries[0]); } + inline const_reference front(void) const{ return(this->__entries[0]); } + inline reference back(void){ return(this->__entries[this->n_entries - 1]); } + inline const_reference back(void) const{ return(this->__entries[this->n_entries - 1]); } + + // Capacity + inline const bool empty(void) const{ return(this->n_entries == 0); } + inline const size_type& size(void) const{ return(this->n_entries); } + + // Iterator + inline iterator begin(){ return iterator(&this->__entries[0]); } + inline iterator end() { return iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator begin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator end() const{ return 
const_iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator cbegin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator cend() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + +protected: + size_type n_entries; + pointer __entries; +}; + +} + +#endif /* TOMAHAWK_BASE_OUTPUT_CONTAINER_REFERENCE_H_ */ diff --git a/src/tomahawk/tomahawk_magic_header.h b/src/tomahawk/tomahawk_magic_header.h new file mode 100644 index 0000000..bcfdfbb --- /dev/null +++ b/src/tomahawk/tomahawk_magic_header.h @@ -0,0 +1,98 @@ +#ifndef TOMAHAWK_TOMAHAWK_MAGIC_HEADER_H_ +#define TOMAHAWK_TOMAHAWK_MAGIC_HEADER_H_ + +#include "../support/MagicConstants.h" + +namespace Tomahawk{ +namespace Base{ + +struct TomahawkMagicHeader{ +public: + typedef TomahawkMagicHeader self_type; + +public: + TomahawkMagicHeader() : + major_version(Tomahawk::Constants::PROGRAM_VERSION_MAJOR), + minor_version(Tomahawk::Constants::PROGRAM_VERSION_MINOR), + file_type(0), + n_samples(0), + n_contigs(0), + controller(0), + l_header(0), + l_header_uncompressed(0) + { + memcpy(&this->magic_string[0], + &Tomahawk::Constants::WRITE_HEADER_MAGIC[0], + Tomahawk::Constants::WRITE_HEADER_MAGIC_LENGTH); + } + + TomahawkMagicHeader(const self_type& other) : + major_version(other.major_version), + minor_version(other.minor_version), + file_type(other.file_type), + n_samples(other.n_samples), + n_contigs(other.n_contigs), + controller(other.controller), + l_header(0), + l_header_uncompressed(0) + { + memcpy(&this->magic_string[0], + &other.magic_string[0], + Tomahawk::Constants::WRITE_HEADER_MAGIC_LENGTH); + } + + ~TomahawkMagicHeader() = default; + + inline const U64& getNumberSamples(void) const{ return(this->n_samples); } + inline U64& getNumberSamples(void){ return(this->n_samples); } + inline const U32& getNumberContigs(void) const{ return(this->n_contigs); } + inline U32& getNumberContigs(void){ return(this->n_contigs); } + + inline bool 
validateMagic(void) const{ return(strncmp(&this->magic_string[0], &Tomahawk::Constants::WRITE_HEADER_MAGIC[0], Tomahawk::Constants::WRITE_HEADER_MAGIC_LENGTH) == 0); } + inline bool validate(void) const{ + return(this->validateMagic() && this->n_samples > 0 && this->n_contigs > 0 && (this->major_version > 0 || this->minor_version > 0) && this->l_header > 0 && this->l_header_uncompressed > 0); + } + +private: + friend std::ostream& operator<<(std::ostream& stream, const self_type& header){ + stream.write(header.magic_string, Tomahawk::Constants::WRITE_HEADER_MAGIC_LENGTH); + stream.write(reinterpret_cast(&Tomahawk::Constants::PROGRAM_VERSION_MAJOR), sizeof(float)); + stream.write(reinterpret_cast(&Tomahawk::Constants::PROGRAM_VERSION_MINOR), sizeof(float)); + stream.write(reinterpret_cast(&header.file_type), sizeof(BYTE)); + stream.write(reinterpret_cast(&header.n_samples), sizeof(U64)); + stream.write(reinterpret_cast(&header.n_contigs), sizeof(U32)); + stream.write(reinterpret_cast(&header.controller), sizeof(U16)); + stream.write(reinterpret_cast(&header.l_header), sizeof(U32)); + stream.write(reinterpret_cast(&header.l_header_uncompressed), sizeof(U32)); + return stream; + } + + friend std::istream& operator>>(std::istream& stream, self_type& header){ + stream.read(header.magic_string, Tomahawk::Constants::WRITE_HEADER_MAGIC_LENGTH); + stream.read(reinterpret_cast(&header.major_version), sizeof(float)); + stream.read(reinterpret_cast(&header.minor_version), sizeof(float)); + stream.read(reinterpret_cast(&header.file_type), sizeof(BYTE)); + stream.read(reinterpret_cast(&header.n_samples), sizeof(U64)); + stream.read(reinterpret_cast(&header.n_contigs), sizeof(U32)); + stream.read(reinterpret_cast(&header.controller), sizeof(U16)); + stream.read(reinterpret_cast(&header.l_header), sizeof(U32)); + stream.read(reinterpret_cast(&header.l_header_uncompressed), sizeof(U32)); + return(stream); + } + +public: + char 
magic_string[Tomahawk::Constants::WRITE_HEADER_MAGIC_LENGTH]; + float major_version; + float minor_version; + BYTE file_type; + U64 n_samples; + U32 n_contigs; + U16 controller; + U32 l_header; + U32 l_header_uncompressed; +}; + +} +} + +#endif /* TOMAHAWK_TOMAHAWK_MAGIC_HEADER_H_ */ diff --git a/src/tomahawk/twk_reader_implementation.h b/src/tomahawk/twk_reader_implementation.h new file mode 100644 index 0000000..56b695c --- /dev/null +++ b/src/tomahawk/twk_reader_implementation.h @@ -0,0 +1,150 @@ +#ifndef TOMAHAWK_BASE_TWK_READER_IMPLEMENTATION_H_ +#define TOMAHAWK_BASE_TWK_READER_IMPLEMENTATION_H_ + +#include "genotype_container.h" + +namespace Tomahawk{ + +template +class TomahawkReaderImpl{ +private: + typedef TomahawkReaderImpl self_type; + typedef Base::GenotypeContainer value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + typedef MetaEntry meta_type; + typedef Totempole::IndexEntry header_entry; + typedef Totempole::IndexEntry support_type; + +public: + TomahawkReaderImpl(const U64 n_samples) : + n_entries(0), + n_capacity(0), + n_samples(n_samples), + __entries(nullptr) + { + + } + + TomahawkReaderImpl(const U64 n_samples, const size_t n_capacity) : + n_entries(0), + n_capacity(n_capacity), + n_samples(n_samples), + __entries(static_cast(::operator new[](this->n_capacity*sizeof(value_type)))) + { + + } + + ~TomahawkReaderImpl(){ + for(size_type i = 0; i < this->size(); ++i) + ((this->__entries + i)->~value_type)(); + + ::operator delete[](static_cast(this->__entries)); + } + + class iterator{ + private: + typedef iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + reference operator*() const{ return *ptr_; } + pointer 
operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + class const_iterator{ + private: + typedef const_iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + const_iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + const_reference operator*() const{ return *ptr_; } + const_pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + // Element access + inline reference at(const size_type& position){ return(this->__entries[position]); } + inline const_reference at(const size_type& position) const{ return(this->__entries[position]); } + inline reference operator[](const size_type& position){ return(this->__entries[position]); } + inline const_reference operator[](const size_type& position) const{ return(this->__entries[position]); } + inline pointer data(void){ return(this->__entries); } + inline const_pointer data(void) const{ return(this->__entries); } + inline reference front(void){ return(this->__entries[0]); } + inline const_reference front(void) const{ return(this->__entries[0]); } + inline reference back(void){ return(this->__entries[this->n_entries - 1]); } + inline const_reference back(void) const{ return(this->__entries[this->n_entries - 1]); } + + // Capacity + inline const bool empty(void) const{ return(this->n_entries == 0); } + inline const size_type& size(void) const{ return(this->n_entries); } + inline const size_type& capacity(void) const{ return(this->n_capacity); } + + // Iterator + inline iterator begin(){ return iterator(&this->__entries[0]); } + inline iterator end() { return iterator(&this->__entries[this->n_entries - 1]); } + 
inline const_iterator begin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator end() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + inline const_iterator cbegin() const{ return const_iterator(&this->__entries[0]); } + inline const_iterator cend() const{ return const_iterator(&this->__entries[this->n_entries - 1]); } + + /**< + * Add a `TWK` block as a reference container to the meta container + * @param data Input data reference + * @param l_data Length of input data + * @param support Paired `TWI` header with this block + * @return Returns TRUE upon success or FALSE otherwise + */ + bool addDataBlock(const char* const data, const size_t l_data, const support_type& support){ + // Container is full + // Resize is required + if(this->n_entries + 1 == this->n_capacity || this->capacity() == 0) + return false; + + //std::cerr << "constructing new @ " << this->n_entries << "/" << this->n_capacity << " and samples: " << this->n_samples << std::endl; + new( &this->__entries[this->n_entries] ) value_type( data, l_data, support, this->n_samples ); + ++this->n_entries; + return true; + } + + /**< + * Counts the number of variants referenced in this meta container + * @return Returns the total number of variants in all containers + */ + const U64 countVariants(void) const{ + U64 n_total = 0; + for(U32 i = 0; i < this->size(); ++i) + n_total += this->at(i).getTotempole().size(); + + return(n_total); + } + + const U64& numberSamples(void) const{ return(this->n_samples); } + +private: + size_type n_entries; + size_type n_capacity; + U64 n_samples; + pointer __entries; +}; + +} + +#endif /* TOMAHAWK_BASE_TWK_READER_IMPLEMENTATION_H_ */ diff --git a/src/tomahawk/TomahawkOutput/TomahawkOutputReader.cpp b/src/tomahawk/two/TomahawkOutputReader.cpp similarity index 50% rename from src/tomahawk/TomahawkOutput/TomahawkOutputReader.cpp rename to src/tomahawk/two/TomahawkOutputReader.cpp index bd9140f..823f15e 100644 --- 
a/src/tomahawk/TomahawkOutput/TomahawkOutputReader.cpp +++ b/src/tomahawk/two/TomahawkOutputReader.cpp @@ -1,63 +1,666 @@ -#include -#include -#include +#include "../two/TomahawkOutputReader.h" +#include +#include +#include +#include + +#include "../../io/compression/GZFConstants.h" +#include "../../io/compression/GZFHeader.h" #include "../../support/helpers.h" -#include "../../support/MagicConstants.h" -#include "../../algorithm/OpenHashTable.h" -#include "TomahawkOutputReader.h" -#include "../../algorithm/sort/TomahawkOutputSort.h" -#include "TomahawkOutputWriter.h" -#include "TomahawkOutputStats.h" +#include "../two/TomahawkOutputStats.h" namespace Tomahawk { -namespace IO { TomahawkOutputReader::TomahawkOutputReader() : - filesize(0), - position(0), - size(0), - hasIndex(false), - output_header(true), - writer_output_type(WRITER_TYPE::natural), - writer(nullptr), - contigs(nullptr), - contig_htable(nullptr), + filesize_(0), + offset_end_of_data_(0), + showHeader_(true), + index_(nullptr), + buffer_(3000000), + data_(3000000), + outputBuffer_(3000000), interval_tree(nullptr), - interval_tree_entries(nullptr), - interval_totempole_enties(nullptr) -{} + interval_tree_entries(nullptr) +{ + +} TomahawkOutputReader::~TomahawkOutputReader(){ - delete [] this->contigs; - delete contig_htable; - if(interval_tree != nullptr){ - for(U32 i = 0; i < this->header.n_contig; ++i) + delete this->index_; + this->buffer_.deleteAll(); + this->data_.deleteAll(); + this->outputBuffer_.deleteAll(); + delete [] this->interval_tree_entries; + if(this->interval_tree != nullptr){ + for(U32 i = 0; i < this->getHeader().getMagic().getNumberContigs(); ++i){ delete this->interval_tree[i]; + } + delete [] this->interval_tree; + } +} + +bool TomahawkOutputReader::open(const std::string input){ + if(input.size() == 0){ + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "No input filename..." 
<< std::endl; + return false; + } + + this->stream_.open(input, std::ios::in | std::ios::binary | std::ios::ate); + if(!this->stream_.good()){ + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Failed to open file handle: " << input << std::endl; + } + this->filesize_ = this->stream_.tellg(); + + this->stream_.seekg(this->filesize_ - TWK_FOOTER_LENGTH); + this->stream_ >> this->footer_; + if(!this->stream_.good()){ + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Stream corrupted after loading footer..." << std::endl; + return false; + } + + if(this->footer_.validate() == false){ + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Failed to validate footer..." << std::endl; + return false; + } + + // Seek to start of index + this->stream_.seekg(this->footer_.offset_end_of_data); + const U32 l_index_data = (this->filesize_ - TWK_FOOTER_LENGTH) - this->stream_.tellg(); + buffer_type index_buffer(l_index_data + 1024); + this->stream_.read(index_buffer.data(), l_index_data); + index_buffer.n_chars = l_index_data; + this->index_ = new index_type(index_buffer.data(), index_buffer.size()); + index_buffer.deleteAll(); + + // Resize buffers to accomodate the largest possible block + // without ever resizing + // this is for performance reasons + this->buffer_.resize(this->getFooter().getLargestUncompressedBlock() + 64); + this->data_.resize(this->getFooter().getLargestUncompressedBlock() + 64); + this->outputBuffer_.resize(this->getFooter().getLargestUncompressedBlock() + 64); + + // Seek to beginning + this->stream_.seekg(0); + if(!this->header_.open(this->stream_)){ + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Failed to load header data..." << std::endl; + return false; + } + + if(!this->stream_.good()){ + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Stream is bad..." 
<< std::endl; + return false; + } + + if(this->header_.validate() == false){ + std::cerr << Helpers::timestamp("ERROR", "TOMAHAWK") << "Failed to validate header..." << std::endl; + return false; + } + + this->offset_end_of_data_ = this->footer_.offset_end_of_data; + + return true; +} + +int TomahawkOutputReader::parseBlock(const bool clear){ + // Stream died + if(this->stream_.good() == false){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Stream died!" << std::endl; + return -1; + } + + // EOF + // tellg will always return a positive value here + // or it would've failed at good() check + if((U64)this->stream_.tellg() == this->offset_end_of_data_) + return 0; + + // Read TGZF header + this->buffer_.resize(sizeof(tgzf_header_type)); + this->stream_.read(this->buffer_.data(), IO::Constants::TGZF_BLOCK_HEADER_LENGTH); + const tgzf_header_type* h = reinterpret_cast(this->buffer_.data()); + this->buffer_.n_chars = IO::Constants::TGZF_BLOCK_HEADER_LENGTH; + if(!h->Validate()){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Failed to validate!" << std::endl; + return -2; + } + + this->buffer_.resize(h->BSIZE); // make sure all data will fit + + // Recast because if compressed_buffer is (actually) resized then the pointer address is incorrect + // resulting in segfault + h = reinterpret_cast(this->buffer_.data()); + + this->stream_.read(&this->buffer_.buffer[IO::Constants::TGZF_BLOCK_HEADER_LENGTH], h->BSIZE - IO::Constants::TGZF_BLOCK_HEADER_LENGTH); + if(!this->stream_.good()){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Truncated file..." 
<< std::endl; + return -3; + } + + this->buffer_.n_chars = h->BSIZE; + const U32 uncompressed_size = *reinterpret_cast(&this->buffer_[this->buffer_.size() - sizeof(U32)]); + + // Clear output compressed_buffer + if(clear) { + this->data_.reset(); + this->data_.resize(uncompressed_size); + } else { // Otherwise resize to permit data + this->data_.resize(this->data_.size() + uncompressed_size); + } + + if(!this->tgzf_controller_.Inflate(this->buffer_, this->data_)){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Failed inflate!" << std::endl; + return -4; + } + + if(this->data_.size() == 0){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Empty data!" << std::endl; + return 0; + } + + // Reset compressed_buffer + this->buffer_.reset(); + + // Reset iterator position and size + //this->iterator_position_block = 0; + //this->size = this->data_.size() / sizeof(entry_type); + + // Validity check + if(this->data_.size() % sizeof(entry_type) != 0){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Data is corrupted!" << std::endl; + return -5; + } + + return 1; +} + +int TomahawkOutputReader::parseBlock(std::ifstream& stream, + buffer_type& inflate_buffer, + buffer_type& data_buffer, + tgzf_controller_type& compression_manager, + const bool clear) const +{ + // Stream died + if(stream.good() == false){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Stream died!" 
<< std::endl; + return -1; + } + + // EOF + // tellg will always return a positive value here + // or it would've failed at good() check + if((U64)stream.tellg() == this->offset_end_of_data_) + return 0; + + // Read TGZF header + inflate_buffer.resize(sizeof(tgzf_header_type)); + stream.read(inflate_buffer.data(), IO::Constants::TGZF_BLOCK_HEADER_LENGTH); + const tgzf_header_type* h = reinterpret_cast(inflate_buffer.data()); + inflate_buffer.n_chars = IO::Constants::TGZF_BLOCK_HEADER_LENGTH; + if(!h->Validate()){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Failed to validate!" << std::endl; + return -2; + } + + inflate_buffer.resize(h->BSIZE); // make sure all data will fit + + // Recast because if compressed_buffer is (actually) resized then the pointer address is incorrect + // resulting in segfault + h = reinterpret_cast(inflate_buffer.data()); + + stream.read(&inflate_buffer.buffer[IO::Constants::TGZF_BLOCK_HEADER_LENGTH], h->BSIZE - IO::Constants::TGZF_BLOCK_HEADER_LENGTH); + if(!stream.good()){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Truncated file..." << std::endl; + return -3; + } + + inflate_buffer.n_chars = h->BSIZE; + const U32 uncompressed_size = *reinterpret_cast(&inflate_buffer[inflate_buffer.size() - sizeof(U32)]); + + // Clear output compressed_buffer + if(clear) { + data_buffer.reset(); + data_buffer.resize(uncompressed_size); + } else { // Otherwise resize to permit data + data_buffer.resize(data_buffer.size() + uncompressed_size); + } + + if(!this->tgzf_controller_.Inflate(inflate_buffer, data_buffer)){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Failed inflate!" << std::endl; + return -4; + } + + if(data_buffer.size() == 0){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Empty data!" 
<< std::endl; + return 0; + } + + // Reset compressed_buffer + inflate_buffer.reset(); + + // Reset iterator position and size + //this->iterator_position_block = 0; + //this->size = data_buffer.size() / sizeof(entry_type); + + // Validity check + if(data_buffer.size() % sizeof(entry_type) != 0){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Data is corrupted!" << std::endl; + return -5; + } + + return 1; +} + +OutputContainer TomahawkOutputReader::getContainerVariants(const U64 n_variants){ + size_t n_variants_loaded = 0; + this->data_.reset(); + this->data_.resize(n_variants*sizeof(entry_type) + 65536); // make room for data + while(true){ + if(!this->parseBlock(false)) + break; + + n_variants_loaded = this->data_.size() / sizeof(entry_type); + //std::cerr << n_variants_loaded << "/" << n_variants << '\t' << this->data_.size() << std::endl; + if(n_variants_loaded >= n_variants) + break; + } + + return(OutputContainer(this->data_)); +} + +OutputContainer TomahawkOutputReader::getContainerBytes(const size_t l_data){ + const U64 start_position = this->stream_.tellg(); + this->data_.reset(); + this->data_.resize(l_data + 65536); // make room for data + U64 data_loaded = 0; + while(true){ + if(!this->parseBlock(false)) + break; + + data_loaded = (U64)this->stream_.tellg() - start_position; + if(data_loaded >= l_data) + break; + + } + + return(OutputContainer(this->data_)); +} + +OutputContainer TomahawkOutputReader::getContainerBlocks(const U32 n_blocks){ + this->data_.reset(); + for(U32 i = 0; i < n_blocks; ++i){ + if(!this->parseBlock(false)) + break; + } + + return(OutputContainer(this->data_)); +} + +bool TomahawkOutputReader::seekBlock(const U32 blockID){ + if(blockID > this->getIndex().getContainer().size()){ + std::cerr << Helpers::timestamp("ERROR","TOI") << "Illegal blockID (" << blockID << ">" << this->getIndex().getContainer().size() << ")!" 
<< std::endl; + return false; + } + + if(!this->stream_.good()){ + std::cerr << Helpers::timestamp("ERROR","TWO") << "Stream is bad!" << std::endl; + return false; + } + + this->stream_.seekg(this->getIndex().getContainer()[blockID].byte_offset); + if(!this->stream_.good()){ + std::cerr << Helpers::timestamp("ERROR","TWO") << "Stream is bad following seek!" << std::endl; + return false; + } + + return(true); +} + +bool TomahawkOutputReader::seekBlock(std::ifstream& stream, const U32 blockID) const{ + if(blockID > this->getIndex().getContainer().size()){ + std::cerr << Helpers::timestamp("ERROR","TOI") << "Illegal blockID (" << blockID << ">" << this->getIndex().getContainer().size() << ")!" << std::endl; + return false; + } + + if(!stream.good()){ + std::cerr << Helpers::timestamp("ERROR","TWO") << "Stream is bad!" << std::endl; + return false; + } + + stream.seekg(this->getIndex().getContainer()[blockID].byte_offset); + if(!stream.good()){ + std::cerr << Helpers::timestamp("ERROR","TWO") << "Stream is bad following seek!" 
<< std::endl; + return false; + } + + return(true); +} + + +OutputContainerReference TomahawkOutputReader::getContainerReferenceBlock(const U32 blockID){ + if(!this->seekBlock(blockID)){ + this->parseBlock(); + return(OutputContainerReference()); + } else { + this->parseBlock(); + return(OutputContainerReference(this->data_)); + } +} + +OutputContainer TomahawkOutputReader::getContainerBlock(const U32 blockID){ + if(!this->seekBlock(blockID)){ + this->parseBlock(); + return(OutputContainer()); + } else { + this->parseBlock(); + return(OutputContainer(this->data_)); + } +} + +OutputContainerReference TomahawkOutputReader::getContainerReferenceBlock(std::vector blocks){ + if(!this->seekBlock(blocks[0])){ + this->parseBlock(); + return(OutputContainerReference()); + } else { + for(U32 i = 0; i < blocks.size(); ++i){ + if(!this->parseBlock(false)) + break; + } + return(OutputContainerReference(this->data_)); + } +} + +OutputContainer TomahawkOutputReader::getContainerBlock(std::vector blocks){ + if(!this->seekBlock(blocks[0])){ + this->parseBlock(); + return(OutputContainer()); + } else { + for(U32 i = 0; i < blocks.size(); ++i){ + if(!this->parseBlock(false)) + break; + } + return(OutputContainer(this->data_)); + } +} + +bool TomahawkOutputReader::addRegions(std::vector& positions){ + if(positions.size() == 0) + return true; + + if(this->interval_tree_entries == nullptr) + this->interval_tree_entries = new std::vector[this->getHeader().getMagic().getNumberContigs()]; + + if(this->interval_tree == nullptr){ + this->interval_tree = new tree_type*[this->getHeader().getMagic().getNumberContigs()]; + for(U32 i = 0; i < this->getHeader().getMagic().getNumberContigs(); ++i) + this->interval_tree[i] = nullptr; + } + + if(!this->__addRegions(positions)) + return false; + + for(U32 i = 0; i < this->getHeader().getMagic().getNumberContigs(); ++i){ + if(this->interval_tree_entries[i].size() != 0){ + this->interval_tree[i] = new tree_type(this->interval_tree_entries[i]); + } else 
+ this->interval_tree[i] = nullptr; + } + + return true; +} + +bool TomahawkOutputReader::__addRegions(std::vector& positions){ + for(U32 i = 0; i < positions.size(); ++i){ + if(positions[i].find(',') != std::string::npos){ + std::vector ret = Helpers::split(positions[i], ','); + if(ret.size() == 1){ + std::cerr << Helpers::timestamp("ERROR", "INTERVAL") << "Illegal interval: " << positions[i] << "!" << std::endl; + return false; + + } else if(ret.size() == 2){ + // parse left + interval_type intervalLeft; + if(this->__ParseRegionIndexed(ret[0], intervalLeft)) + this->interval_tree_entries[intervalLeft.contigID].push_back(interval_type(intervalLeft)); + + // parse right + interval_type intervalRight; + if(this->__ParseRegionIndexed(ret[1], intervalRight)) + this->interval_tree_entries[intervalRight.contigID].push_back(interval_type(intervalRight)); + + } else { + std::cerr << Helpers::timestamp("ERROR", "INTERVAL") << "Illegal interval: " << positions[i] << "!" << std::endl; + return false; + } + } + // Has no comma in string + else { + interval_type interval; + if(this->__ParseRegionIndexed(positions[i], interval)) + this->interval_tree_entries[interval.contigID].push_back(interval_type(interval)); + } + } + + return true; +} + +bool TomahawkOutputReader::__ParseRegionIndexed(const std::string& region, interval_type& interval){ + std::vector ret = Helpers::split(region, ':'); + + // If vector does not contain a colon + if(ret.size() == 1){ + if(ret[0].find('-') != std::string::npos){ + std::cerr << Helpers::timestamp("ERROR", "INTERVAL") << "Illegal interval: " << region << "!" << std::endl; + return false; + } + + // is contigID only + const S32 contigID = this->getHeader().getContigID(ret[0]); + if(contigID < 0){ + std::cerr << Helpers::timestamp("ERROR", "INTERVAL") << "Contig: " << region << " is not defined in the header!" 
<< std::endl; + return false; + } + + interval(contigID, 0, this->getHeader().contigs_[contigID].n_bases); + interval.state = interval_type::INTERVAL_TYPE::INTERVAL_CONTIG_ONLY; } - //delete interval_tree; - delete [] interval_tree_entries; - delete interval_tree; - this->buffer.deleteAll(); - this->output_buffer.deleteAll(); - delete this->writer; - delete this->interval_totempole_enties; + // If vector contain colon + else if(ret.size() == 2){ + // is contigID:pos-pos + const S32 contigID = this->getHeader().getContigID(ret[0]); + if(contigID < 0){ + std::cerr << Helpers::timestamp("ERROR", "INTERVAL") << "Contig: " << ret[0] << " is not defined in the header!" << std::endl; + return false; + } + + std::vector retPos = Helpers::split(ret[1], '-'); + if(retPos.size() == 1){ + // only one pos + const double pos = std::stod(retPos[0]); + //std::cerr << "single position: " << pos << std::endl; + interval(contigID, pos, pos); + interval.state = interval_type::INTERVAL_TYPE::INTERVAL_POSITION; + + } else if(retPos.size() == 2){ + // is two positions + double posA = std::stod(retPos[0]); + double posB = std::stod(retPos[1]); + + // Swap pA and pB iff pB > pA + if(posB < posA) + std::swap(posA, posB); + + interval(contigID, posA, posB); + interval.state = interval_type::INTERVAL_TYPE::INTERVAL_FULL; + + } else { + std::cerr << Helpers::timestamp("ERROR", "INTERVAL") << "Illegal interval: " << region << "!" << std::endl; + return false; + } + } + // contains > 1 colons + // illegal + else { + std::cerr << Helpers::timestamp("ERROR", "INTERVAL") << "Illegal interval: " << region << "!" 
<< std::endl; + return false; + } + + return true; } -bool TomahawkOutputReader::view(const std::string& input){ +bool TomahawkOutputReader::view(void){ if(this->interval_tree != nullptr) // If regions have been set: use region-filter function return(this->__viewRegion()); - else if(this->filter.any_filter_user_set){ + else if(this->filters_.any_filter_user_set) return(this->__viewFilter()); // Otherwise normal filter function - } else + else return(this->__viewOnly()); } +bool TomahawkOutputReader::__viewOnly(void){ + //std::cerr << Helpers::timestamp("LOG") << "Sorted: " << (int)this->getIndex().getController().isSorted << " partial: " << (int)this->getIndex().getController().isPartialSorted << std::endl; + this->getHeader().getLiterals() += "\n##tomahawk_viewCommand=" + Helpers::program_string(); + this->getHeader().getLiterals() += "\n##tomahawk_viewFilters=" + this->filters_.getInterpretedString() + " filter=NO regions=NO"; + + //if(!this->OpenWriter()) + // return false; + + if(this->showHeader_ == true){ + std::cout << this->getHeader().getLiterals() << '\n'; + } + + // Natural output required parsing + size_t n_total = 0; + //if(this->writer_output_type == WRITER_TYPE::natural){ + while(this->parseBlock()){ + OutputContainerReference o = this->getContainerReference(); + n_total += o.size(); + for(U32 i = 0; i < o.size(); ++i){ + o[i].write(std::cout, this->getHeader().contigs_); + } + } + //std::cerr << "total: " << n_total << std::endl; + //} + // Binary output without filtering simply writes it back out +/* + else if(this->writer_output_type == WRITER_TYPE::binary){ + while(this->parseBlock()){ + OutputContainerReference o(this->compressed_buffer); + //this->writer->write(this->data_); + std::cout << o[0] << std::endl; + } + } +*/ + return true; +} + +bool TomahawkOutputReader::__viewRegion(void){ + this->getHeader().getLiterals() += "\n##tomahawk_viewCommand=" + Helpers::program_string(); + if(this->filters_.any_filter_user_set) + 
this->getHeader().getLiterals() += "\n##tomahawk_viewFilters=" + this->filters_.getInterpretedString() + " filter=YES regions=YES"; + + if(this->showHeader_ == true){ + std::cout << this->getHeader().getLiterals() << '\n'; + } + + if(this->interval_tree != nullptr){ + while(this->parseBlock()){ + output_container_reference_type o(this->data_); + for(U32 i = 0; i < o.size(); ++i) + this->__checkRegionNoIndex(o[i]); + } // end while next block + } + + return true; +} + +bool TomahawkOutputReader::__viewFilter(void){ + this->getHeader().getLiterals() += "\n##tomahawk_viewCommand=" + Helpers::program_string(); + this->getHeader().getLiterals() += "\n##tomahawk_viewFilters=" + this->filters_.getInterpretedString() + " filter=YES regions=NO"; + + if(this->showHeader_ == true) + std::cout << this->getHeader().getLiterals() << '\n'; + + while(this->parseBlock()){ + output_container_reference_type o(this->data_); + for(U32 i = 0; i < o.size(); ++i){ + if(this->filters_.filter(o[i])){ + o[i].write(std::cout, this->getHeader().contigs_); + } + } + } // end while next block + return true; +} + +bool TomahawkOutputReader::__checkRegionNoIndex(const entry_type& entry){ + // If iTree for contigA exists + if(this->interval_tree[entry.AcontigID] != nullptr){ + std::vector rets = this->interval_tree[entry.AcontigID]->findOverlapping(entry.Aposition, entry.Aposition); + if(rets.size() > 0){ + for(U32 i = 0; i < rets.size(); ++i){ + if(rets[i].value != nullptr){ // if linked + if((entry.BcontigID == rets[i].value->contigID) && + (entry.Bposition >= rets[i].value->start && + entry.Bposition <= rets[i].value->stop)){ + if(this->filters_.filter(entry)){ + //entry.write(std::cout, this->contigs); + //*this->writer << entry; + entry.write(std::cout, this->getHeader().contigs_); + } + + return true; + } // end match + } else { // not linked + if(this->filters_.filter(entry)){ + //entry.write(std::cout, this->contigs); + //*this->writer << entry; + entry.write(std::cout, 
this->getHeader().contigs_); + } + + return true; + } + } + } + } + + // If iTree for contigB exists + if(this->interval_tree[entry.BcontigID] != nullptr){ + std::vector rets = this->interval_tree[entry.BcontigID]->findOverlapping(entry.Bposition, entry.Bposition); + if(rets.size() > 0){ + for(U32 i = 0; i < rets.size(); ++i){ + if(rets[i].value != nullptr){ // if linked + if((entry.AcontigID == rets[i].value->contigID) && + (entry.Aposition >= rets[i].value->start && + entry.Aposition <= rets[i].value->stop)){ + if(this->filters_.filter(entry)){ + //entry.write(std::cout, this->contigs); + //*this->writer << entry; + entry.write(std::cout, this->getHeader().contigs_); + } + return true; + } // end match + } else { // not linked + if(this->filters_.filter(entry)){ + //entry.write(std::cout, this->contigs); + //*this->writer << entry; + entry.write(std::cout, this->getHeader().contigs_); + } + + return true; + } + } + } // end if any hit in iTree b + } // end iTree b + + return false; +} + +/* bool TomahawkOutputReader::OpenWriter(void){ if(this->writer_output_type == WRITER_TYPE::natural){ - this->writer = new TomahawkOutputWriterNatural(this->contigs, &this->header); + this->writer = new OutputWriterNatural(this->contigs, &this->header); } - else this->writer = new TomahawkOutputWriter(this->contigs, &this->header); + else this->writer = new OutputWriter(this->contigs, &this->header); if(!this->writer->open()) return false; @@ -70,9 +673,9 @@ bool TomahawkOutputReader::OpenWriter(void){ bool TomahawkOutputReader::OpenWriter(const std::string output_file){ if(this->writer_output_type == WRITER_TYPE::natural){ - this->writer = new TomahawkOutputWriterNatural(this->contigs, &this->header); + this->writer = new OutputWriterNatural(this->contigs, &this->header); } - else this->writer = new TomahawkOutputWriter(this->contigs, &this->header); + else this->writer = new OutputWriter(this->contigs, &this->header); if(!this->writer->open(output_file)){ std::cerr << 
Helpers::timestamp("ERROR","WRITER") << "Failed to open output file: " << output_file << std::endl; @@ -102,15 +705,20 @@ bool TomahawkOutputReader::__viewRegion(void){ if(this->interval_tree != nullptr){ const entry_type* entry = nullptr; - while(this->nextVariant(entry)){ - this->__checkRegionNoIndex(entry); + + while(this->parseBlock()){ + output_container_reference_type o(this->data_); + for(U32 i = 0; i < o.size(); ++i) + this->__checkRegionNoIndex(o[i]); } // end while next variant } return true; } +*/ bool TomahawkOutputReader::__viewRegionIndexed(void){ + /* if(this->interval_tree == nullptr){ std::cerr << Helpers::timestamp("ERROR","TWO") << "Interval tree not set!" << std::endl; return false; @@ -134,11 +742,12 @@ bool TomahawkOutputReader::__viewRegionIndexed(void){ // 1 entry if(block_length == 0){ - if(!this->getBlock(entry.fromBlock)){ + if(!this->seekBlock(entry.fromBlock)){ std::cerr << Helpers::timestamp("ERROR","TWO") << "Could not get block" << std::endl; return false; } - this->position = entry.fromBlock_entries_offset; + this->iterator_position_block = entry.fromBlock_entries_offset; + while(this->nextVariantLimited(two_entry)){ this->__checkRegionIndex(two_entry); @@ -147,18 +756,18 @@ bool TomahawkOutputReader::__viewRegionIndexed(void){ // 2 entries else if(block_length == 1){ // First one - if(!this->getBlock(entry.fromBlock)){ + if(!this->seekBlock(entry.fromBlock)){ std::cerr << Helpers::timestamp("ERROR","TWO") << "Could not get block" << std::endl; return false; } - this->position = entry.fromBlock_entries_offset; + this->iterator_position_block = entry.fromBlock_entries_offset; while(this->nextVariantLimited(two_entry)){ this->__checkRegionIndex(two_entry); } // end while next variant // Second one - if(!this->getBlock(entry.toBlock)){ + if(!this->seekBlock(entry.toBlock)){ std::cerr << Helpers::timestamp("ERROR","TWO") << "Could not get block" << std::endl; return false; } @@ -173,11 +782,11 @@ bool 
TomahawkOutputReader::__viewRegionIndexed(void){ else { // First block U32 j = entry.fromBlock; - if(!this->getBlock(j)){ + if(!this->seekBlock(j)){ std::cerr << Helpers::timestamp("ERROR","TWO") << "Could not get block" << std::endl; return false; } - this->position = entry.fromBlock_entries_offset; + this->iterator_position_block = entry.fromBlock_entries_offset; while(this->nextVariantLimited(two_entry)){ this->__checkRegionIndex(two_entry); @@ -186,7 +795,7 @@ bool TomahawkOutputReader::__viewRegionIndexed(void){ // Middle blocks for(; j < entry.toBlock - 1; ++j){ - if(!this->getBlock(j)){ + if(!this->seekBlock(j)){ std::cerr << Helpers::timestamp("ERROR","TWO") << "Could not get block" << std::endl; return false; } @@ -197,7 +806,7 @@ bool TomahawkOutputReader::__viewRegionIndexed(void){ } // last block - if(!this->getBlock(j)){ + if(!this->seekBlock(j)){ std::cerr << Helpers::timestamp("ERROR","TWO") << "Could not get block" << std::endl; return false; } @@ -209,44 +818,45 @@ bool TomahawkOutputReader::__viewRegionIndexed(void){ } } + */ return true; } -bool TomahawkOutputReader::__checkRegionIndex(const entry_type* const entry){ +/* +bool TomahawkOutputReader::__checkRegionIndex(const entry_type& entry){ // If iTree for contigA exists - if(this->interval_tree[entry->AcontigID] != nullptr){ - std::vector rets = this->interval_tree[entry->AcontigID]->findOverlapping(entry->Aposition, entry->Aposition); + if(this->interval_tree[entry.AcontigID] != nullptr){ + std::vector rets = this->interval_tree[entry.AcontigID]->findOverlapping(entry.Aposition, entry.Aposition); if(rets.size() > 0){ for(U32 i = 0; i < rets.size(); ++i){ - if(this->filter.filter(*entry)) + if(this->filter.filter(entry)) *this->writer << entry; return true; } } } - return false; } -bool TomahawkOutputReader::__checkRegionNoIndex(const entry_type* const entry){ +bool TomahawkOutputReader::__checkRegionNoIndex(const entry_type& entry){ // If iTree for contigA exists - 
if(this->interval_tree[entry->AcontigID] != nullptr){ - std::vector rets = this->interval_tree[entry->AcontigID]->findOverlapping(entry->Aposition, entry->Aposition); + if(this->interval_tree[entry.AcontigID] != nullptr){ + std::vector rets = this->interval_tree[entry.AcontigID]->findOverlapping(entry.Aposition, entry.Aposition); if(rets.size() > 0){ for(U32 i = 0; i < rets.size(); ++i){ if(rets[i].value != nullptr){ // if linked - if((entry->BcontigID == rets[i].value->contigID) && - (entry->Bposition >= rets[i].value->start && entry->Bposition <= rets[i].value->stop)){ - if(this->filter.filter(*entry)) - //entry->write(std::cout, this->contigs); + if((entry.BcontigID == rets[i].value->contigID) && + (entry.Bposition >= rets[i].value->start && entry.Bposition <= rets[i].value->stop)){ + if(this->filter.filter(entry)) + //entry.write(std::cout, this->contigs); *this->writer << entry; return true; } // end match } else { // not linked - if(this->filter.filter(*entry)) - //entry->write(std::cout, this->contigs); + if(this->filter.filter(entry)) + //entry.write(std::cout, this->contigs); *this->writer << entry; return true; @@ -256,22 +866,22 @@ bool TomahawkOutputReader::__checkRegionNoIndex(const entry_type* const entry){ } // If iTree for contigB exists - if(this->interval_tree[entry->BcontigID] != nullptr){ - std::vector rets = this->interval_tree[entry->BcontigID]->findOverlapping(entry->Bposition, entry->Bposition); + if(this->interval_tree[entry.BcontigID] != nullptr){ + std::vector rets = this->interval_tree[entry.BcontigID]->findOverlapping(entry.Bposition, entry.Bposition); if(rets.size() > 0){ for(U32 i = 0; i < rets.size(); ++i){ if(rets[i].value != nullptr){ // if linked - if((entry->AcontigID == rets[i].value->contigID) && - (entry->Aposition >= rets[i].value->start && entry->Aposition <= rets[i].value->stop)){ - if(this->filter.filter(*entry)){ - //entry->write(std::cout, this->contigs); + if((entry.AcontigID == rets[i].value->contigID) && + 
(entry.Aposition >= rets[i].value->start && entry.Aposition <= rets[i].value->stop)){ + if(this->filter.filter(entry)){ + //entry.write(std::cout, this->contigs); *this->writer << entry; } return true; } // end match } else { // not linked - if(this->filter.filter(*entry)) - //entry->write(std::cout, this->contigs); + if(this->filter.filter(entry)) + //entry.write(std::cout, this->contigs); *this->writer << entry; return true; @@ -291,16 +901,24 @@ bool TomahawkOutputReader::__viewOnly(void){ return false; // Natural output required parsing + size_t n_total = 0; if(this->writer_output_type == WRITER_TYPE::natural){ - const entry_type* entry = nullptr; - while(this->nextVariant(entry)) - *this->writer << entry; - + while(this->parseBlock()){ + OutputContainerReference o = this->getContainerReference(); + std::cerr << o.size() << '\t' << this->data_.size() << std::endl; + n_total += o.size(); + for(U32 i = 0; i < o.size(); ++i) + std::cout << o[i] << '\n'; + } + std::cerr << "total: " << n_total << std::endl; } // Binary output without filtering simply writes it back out else if(this->writer_output_type == WRITER_TYPE::binary){ - while(this->nextBlock()) - this->writer->write(this->output_buffer); + while(this->parseBlock()){ + OutputContainerReference o(this->compressed_buffer); + //this->writer->write(this->data_); + std::cout << o[0] << std::endl; + } } return true; @@ -313,16 +931,17 @@ bool TomahawkOutputReader::__viewFilter(void){ if(!this->OpenWriter()) return false; - const entry_type* entry = nullptr; - while(this->nextVariant(entry)){ - if(this->filter.filter(*entry)) - *this->writer << entry; - } - + while(this->parseBlock()){ + output_container_reference_type o(this->data_); + for(U32 i = 0; i < o.size(); ++i){ + if(this->filter.filter(o[i])) + *this->writer << o[i]; + } + } // end while next variant return true; } -bool TomahawkOutputReader::AddRegionsIndexed(std::vector& positions){ +bool TomahawkOutputReader::addRegionsIndexed(std::vector& positions){ 
for(U32 i = 0; i < positions.size(); ++i){ if(positions[i].find(',') != std::string::npos){ std::vector ret = Helpers::split(positions[i], ','); @@ -357,7 +976,7 @@ bool TomahawkOutputReader::AddRegionsIndexed(std::vector& positions return true; } -bool TomahawkOutputReader::AddRegionsUnindexed(std::vector& positions){ +bool TomahawkOutputReader::addRegionsUnindexed(std::vector& positions){ for(U32 i = 0; i < positions.size(); ++i){ // Pattern cA:pAf-pAt;cB:pBf-pBt if(positions[i].find(',') != std::string::npos){ @@ -406,7 +1025,7 @@ bool TomahawkOutputReader::AddRegionsUnindexed(std::vector& positio return true; } -bool TomahawkOutputReader::AddRegions(std::vector& positions){ +bool TomahawkOutputReader::addRegions(std::vector& positions){ if(positions.size() == 0) return true; @@ -420,10 +1039,10 @@ bool TomahawkOutputReader::AddRegions(std::vector& positions){ } if(this->toi_reader.ERROR_STATE == toi_reader_type::TOI_OK && (this->toi_reader.getIsSortedExpanded())){ - if(!this->AddRegionsIndexed(positions)) + if(!this->addRegionsIndexed(positions)) return false; } else { - if(!this->AddRegionsUnindexed(positions)) + if(!this->addRegionsUnindexed(positions)) return false; } @@ -451,7 +1070,7 @@ bool TomahawkOutputReader::__ParseRegion(const std::string& region, interval_typ std::cerr << Helpers::timestamp("ERROR", "INTERVAL") << "Contig: " << region << " is not defined in the header!" << std::endl; return false; } - interval(*contigID, 0, this->contigs[*contigID].bases); + interval(*contigID, 0, this->contigs[*contigID].n_bases); } else if(ret.size() == 2){ // is contigID:pos-pos @@ -505,7 +1124,7 @@ bool TomahawkOutputReader::__ParseRegionIndexed(const std::string& region, inter std::cerr << Helpers::timestamp("ERROR", "INTERVAL") << "Contig: " << region << " is not defined in the header!" 
<< std::endl; return false; } - interval(*contigID, 0, this->contigs[*contigID].bases); + interval(*contigID, 0, this->contigs[*contigID].n_bases); interval.state = interval_type::INTERVAL_TYPE::INTERVAL_CONTIG_ONLY; } // If vector contain colon @@ -605,21 +1224,21 @@ bool TomahawkOutputReader::__ParseRegionIndexedBlocks(void){ } bool TomahawkOutputReader::__Open(const std::string input){ - this->stream.open(input, std::ios::binary | std::ios::in | std::ios::ate); - if(!this->stream.good()){ + this->stream_.open(input, std::ios::binary | std::ios::in | std::ios::ate); + if(!this->stream_.good()){ std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Failed to open file: " << input << std::endl; return false; } - this->filesize = this->stream.tellg(); - this->stream.seekg(0); + this->filesize = this->stream_.tellg(); + this->stream_.seekg(0); - if(!this->stream.good()){ + if(!this->stream_.good()){ std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Bad stream!" << std::endl; return false; } - this->stream >> this->header; + this->stream_ >> this->header; if(!this->header.validate(Tomahawk::Constants::WRITE_HEADER_LD_MAGIC)){ std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Failed to validate header!" << std::endl; return false; @@ -683,22 +1302,22 @@ bool TomahawkOutputReader::__concat(const std::vector& files, const return false; } - while(this->nextBlock()){ - this->writer->write(this->output_buffer); + while(this->parseBlock()){ + this->writer->write(this->data_); } for(U32 i = 1; i < files.size(); ++i){ if(!SILENT) std::cerr << Helpers::timestamp("LOG", "CONCAT") << "Opening input: " << files[i] << "..." << std::endl; - this->stream.close(); + this->stream_.close(); if(!this->OpenExtend(files[i])){ std::cerr << Helpers::timestamp("ERROR","TWO") << "Failed to parse: " << files[i] << "..." 
<< std::endl; return false; } - while(this->nextBlock()){ - this->writer->write(this->output_buffer); + while(this->parseBlock()){ + this->writer->write(this->data_); } } @@ -754,7 +1373,7 @@ bool TomahawkOutputReader::ParseHeader(void){ U32* ret; for(U32 i = 0; i < this->header.n_contig; ++i){ - this->stream >> this->contigs[i]; + this->stream_ >> this->contigs[i]; if(!this->contig_htable->GetItem(&this->contigs[i].name[0], &this->contigs[i].name, ret, this->contigs[i].name.size())){ // Add to hash table this->contig_htable->SetItem(&this->contigs[i].name[0], &this->contigs[i].name, i, this->contigs[i].name.size()); @@ -764,12 +1383,12 @@ bool TomahawkOutputReader::ParseHeader(void){ } } - if(!this->gzip_controller.InflateBlock(this->stream, this->buffer)){ + if(!this->tgzf_controller.InflateBlock(this->stream_, this->compressed_buffer)){ std::cerr << Helpers::timestamp("ERROR","TGZF") << "Failed to get TWO block" << std::endl; return false; } - this->literals = std::string(this->gzip_controller.buffer.data); + this->literals = std::string(this->tgzf_controller.buffer.data()); return true; } @@ -780,7 +1399,7 @@ bool TomahawkOutputReader::ParseHeaderExtend(void){ U32* ret; for(U32 i = 0; i < this->header.n_contig; ++i){ - this->stream >> this->contigs[i]; + this->stream_ >> this->contigs[i]; // std::cerr << this->contigs[i] << std::endl; if(!this->contig_htable->GetItem(&this->contigs[i].name[0], &this->contigs[i].name, ret, this->contigs[i].name.size())){ std::cerr << Helpers::timestamp("ERROR","TWO") << "Contig does not exist in other file" << std::endl; @@ -788,7 +1407,7 @@ bool TomahawkOutputReader::ParseHeaderExtend(void){ } } - if(!this->gzip_controller.InflateBlock(this->stream, this->buffer)){ + if(!this->tgzf_controller.InflateBlock(this->stream_, this->compressed_buffer)){ std::cerr << Helpers::timestamp("ERROR","TGZF") << "Failed to get TWO block" << std::endl; return false; } @@ -796,7 +1415,7 @@ bool TomahawkOutputReader::ParseHeaderExtend(void){ 
return true; } -bool TomahawkOutputReader::getBlock(const U32 blockID){ +bool TomahawkOutputReader::seekBlock(const U32 blockID){ if(this->toi_reader.ERROR_STATE != toi_reader_type::TOI_OK){ std::cerr << Helpers::timestamp("ERROR","TOI") << "Index is bad!" << std::endl; return false; @@ -807,261 +1426,29 @@ bool TomahawkOutputReader::getBlock(const U32 blockID){ return false; } - if(!this->stream.good()){ + if(!this->stream_.good()){ std::cerr << Helpers::timestamp("ERROR","TWO") << "Stream is bad!" << std::endl; return false; } - this->stream.seekg(this->toi_reader[blockID].byte_offset); - if(!this->stream.good()){ + this->stream_.seekg(this->toi_reader[blockID].getStartOffset()); + if(!this->stream_.good()){ std::cerr << Helpers::timestamp("ERROR","TWO") << "Stream is bad following seek!" << std::endl; return false; } - return(this->nextBlock()); -} - -bool TomahawkOutputReader::nextBlock(const bool clear){ - // Stream died - if(!this->stream.good()){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Stream died!" << std::endl; - return false; - } - - // EOF - if(this->stream.tellg() == this->filesize) - return false; - - buffer.resize(sizeof(tgzf_type)); - this->stream.read(&buffer.data[0], Constants::TGZF_BLOCK_HEADER_LENGTH); - const tgzf_type* h = reinterpret_cast(&buffer.data[0]); - buffer.pointer = Constants::TGZF_BLOCK_HEADER_LENGTH; - if(!h->Validate()){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Failed to validate!" << std::endl; - return false; - } - - buffer.resize(h->BSIZE); // make sure all data will fit - - // Recast because if buffer is (actually) resized then the pointer address is incorrect - // resulting in segfault - h = reinterpret_cast(&buffer.data[0]); - - this->stream.read(&buffer.data[Constants::TGZF_BLOCK_HEADER_LENGTH], h->BSIZE - Constants::TGZF_BLOCK_HEADER_LENGTH); - if(!this->stream.good()){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Truncated file..." 
<< std::endl; - return false; - } - - buffer.pointer = h->BSIZE; - const U32 uncompressed_size = *reinterpret_cast(&buffer[buffer.pointer - sizeof(U32)]); - output_buffer.resize(uncompressed_size); - - // Clear output buffer - if(clear) - this->output_buffer.reset(); - - if(!this->gzip_controller.Inflate(buffer, output_buffer)){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Failed inflate!" << std::endl; - return false; - } - - if(this->output_buffer.size() == 0){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Empty data!" << std::endl; - return false; - } - - // Reset buffer - this->buffer.reset(); - - // Reset iterator position and size - this->position = 0; - this->size = this->output_buffer.size() / sizeof(entry_type); - - // Validity check - if(this->output_buffer.size() % sizeof(entry_type) != 0){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Data is corrupted!" << std::endl; - return false; - } - - return true; -} - -bool TomahawkOutputReader::nextBlockUntil(const U32 limit){ - // Check if resize required - if(this->output_buffer.capacity() < limit + 65536) - this->output_buffer.resize(limit + 65536); - - this->position = 0; - this->output_buffer.reset(); - - // Keep inflating DATA until bounds is reached - while(this->output_buffer.size() <= limit){ - // Stream died - if(!this->stream.good()){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Stream died!" 
<< std::endl; - return false; - } - - // EOF - // Casting stream to U64 is safe as this point is not - // reached if above good() return fails - if((U64)this->stream.tellg() == this->filesize){ - //std::cerr << "eof" << std::endl; - return false; - } - - buffer.resize(sizeof(tgzf_type)); - this->stream.read(&buffer.data[0], Constants::TGZF_BLOCK_HEADER_LENGTH); - const tgzf_type* h = reinterpret_cast(&buffer.data[0]); - buffer.pointer = Constants::TGZF_BLOCK_HEADER_LENGTH; - if(!h->Validate()){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Failed to validate header!" << std::endl; - return false; - } - - buffer.resize(h->BSIZE); // make sure all data will fit - - // Recast because if buffer is resized then the pointer address is incorrect - // resulting in segfault - h = reinterpret_cast(&buffer.data[0]); - - this->stream.read(&buffer.data[Constants::TGZF_BLOCK_HEADER_LENGTH], h->BSIZE - Constants::TGZF_BLOCK_HEADER_LENGTH); - if(!this->stream.good()){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Truncated file..." << std::endl; - return false; - } - - buffer.pointer = h->BSIZE; - - if(!this->gzip_controller.Inflate(buffer, output_buffer)){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Failed inflate!" << std::endl; - return false; - } - - if(this->output_buffer.size() == 0){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Empty data!" << std::endl; - return false; - } - - // Reset buffer - this->buffer.reset(); - - // Reset iterator position and size - this->size = this->output_buffer.size() / sizeof(entry_type); - - // Validity check - if(this->output_buffer.size() % sizeof(entry_type) != 0){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Data is corrupted!" 
<< std::endl; - return false; - } - } - return true; -} - -bool TomahawkOutputReader::nextBlockUntil(const U32 limit, const U64 virtual_offset){ - // Check if resize required - if(this->output_buffer.capacity() < limit + 65536) - this->output_buffer.resize(limit + 65536); - - this->position = 0; - this->output_buffer.reset(); - - // Keep inflating DATA until bounds is reached - while(this->output_buffer.size() <= limit && this->stream.tellg() != (U64)virtual_offset){ - // Stream died - if(!this->stream.good()){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Stream died!" << std::endl; - return false; - } - - // EOF - // Casting stream to U64 is safe as this point is not - // reached if above good() return fails - if((U64)this->stream.tellg() == this->filesize){ - //std::cerr << "eof" << std::endl; - return false; - } - - buffer.resize(sizeof(tgzf_type)); - this->stream.read(&buffer.data[0], Constants::TGZF_BLOCK_HEADER_LENGTH); - const tgzf_type* h = reinterpret_cast(&buffer.data[0]); - buffer.pointer = Constants::TGZF_BLOCK_HEADER_LENGTH; - if(!h->Validate()){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Failed to validate header!" << std::endl; - return false; - } - - buffer.resize(h->BSIZE); // make sure all data will fit - - // Recast because if buffer is resized then the pointer address is incorrect - // resulting in segfault - h = reinterpret_cast(&buffer.data[0]); - - this->stream.read(&buffer.data[Constants::TGZF_BLOCK_HEADER_LENGTH], h->BSIZE - Constants::TGZF_BLOCK_HEADER_LENGTH); - if(!this->stream.good()){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Truncated file..." << std::endl; - return false; - } - - buffer.pointer = h->BSIZE; - - if(!this->gzip_controller.Inflate(buffer, output_buffer)){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Failed inflate!" 
<< std::endl; - return false; - } - - if(this->output_buffer.size() == 0){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Empty data!" << std::endl; - return false; - } - - // Reset buffer - this->buffer.reset(); - - // Reset iterator position and size - this->size = this->output_buffer.size() / sizeof(entry_type); - - // Validity check - if(this->output_buffer.size() % sizeof(entry_type) != 0){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TWO") << "Data is corrupted!" << std::endl; - return false; - } - } - return true; -} - -bool TomahawkOutputReader::nextVariant(const entry_type*& entry){ - if(this->position == this->size){ - if(!this->nextBlock()) - return false; - } - - entry = (*this)[this->position]; - ++this->position; - - return true; -} - -// Do NOT get a new variant when reaching end of data -bool TomahawkOutputReader::nextVariantLimited(const entry_type*& entry){ - if(this->position == this->size){ - return false; - } - - entry = (*this)[this->position]; - ++this->position; - - return true; + return(this->parseBlock()); } bool TomahawkOutputReader::summary(const std::string& input, const U32 bins){ TWO::TomahawkOutputStatsContainer container(bins); // Natural output required parsing - const entry_type* entry = nullptr; - while(this->nextVariant(entry)) - container += *entry; + while(this->parseBlock()){ + output_container_reference_type o(this->data_); + for(U32 i = 0; i < o.size(); ++i) + container += o[i]; + } std::cerr << "R2\t" << container.R2.within.getTotal() << '\t' << container.R2.across.getTotal() << '\t' << container.R2.global.getTotal() << std::endl; std::cerr << container.R2 << std::endl; @@ -1112,7 +1499,6 @@ bool TomahawkOutputReader::setWriterType(const int type){ } return true; } +*/ - -} } /* namespace Tomahawk */ diff --git a/src/tomahawk/two/TomahawkOutputReader.h b/src/tomahawk/two/TomahawkOutputReader.h new file mode 100644 index 0000000..5b791cf --- /dev/null +++ b/src/tomahawk/two/TomahawkOutputReader.h 
@@ -0,0 +1,180 @@ +#ifndef TOMAHAWKOUTPUTREADER_H_ +#define TOMAHAWKOUTPUTREADER_H_ + +#include +#include +#include +#include +#include + +#include "../../io/BasicBuffer.h" +#include "../../io/compression/TGZFController.h" +#include "../../support/MagicConstants.h" +#include "../../algorithm/open_hashtable.h" +#include "../../support/type_definitions.h" +#include "../../third_party/intervalTree.h" +#include "../../tomahawk/output_container.h" +#include "../../tomahawk/output_container_reference.h" +#include "../two/output_entry.h" +#include "output_filter.h" +#include "../../index/index.h" +#include "../../index/footer.h" +#include "../../index/tomahawk_header.h" + +namespace Tomahawk { + +class TomahawkOutputReader { +private: + typedef IO::OutputEntry entry_type; + typedef OutputFilter filter_type; + typedef OutputContainer output_container_type; + typedef OutputContainerReference output_container_reference_type; + typedef Totempole::HeaderContig contig_type; + typedef IO::TGZFHeader tgzf_header_type; + typedef Algorithm::ContigInterval interval_type; + typedef TomahawkHeader header_type; + typedef Index index_type; + typedef IO::BasicBuffer buffer_type; + typedef IO::TGZFController tgzf_controller_type; + typedef Totempole::Footer footer_type; + + typedef Algorithm::IntervalTree tree_type; + typedef Hash::HashTable hash_table; + +public: + TomahawkOutputReader(); + ~TomahawkOutputReader(); + + // Accessors + inline footer_type& getFooter(void){ return(this->footer_); } + inline const footer_type& getFooter(void) const{ return(this->footer_); } + inline const index_type& getIndex(void) const{ return(*this->index_); } + inline index_type& getIndex(void){ return(*this->index_); } + inline const header_type& getHeader(void) const{ return(this->header_); } + inline header_type& getHeader(void){ return(this->header_); } + inline index_type* getIndexPointer(void){ return(this->index_); } + + bool open(const std::string input); + bool addRegions(std::vector& positions); 
+ //bool OpenExtend(const std::string input); + + // Streaming functions + /**< + * Seek to block at a given position and load that + * data into memory + * @param position Target block position + * @return Returns TRUE upon success or FALSE otherwise + */ + bool seekBlock(const U32 position); + + /**< + * Used in parallel programming: + * Takes an input file stream and seeks to a given position and load that + * data into memory without modifying the host container + * @param stream Input file stream + * @param position Target block position + * @return Returns TRUE upon success or FALSE otherwise + */ + bool seekBlock(std::ifstream& stream, const U32 position) const; + + /**< + * Parses TWO data that has been loaded into memory after invoking + * either getBlock functions. This function also increments the internal + * position of the file handler. + * @param clear Boolean set to TRUE if raw data should be cleared after invoking this function + * @return Returns TRUE upon success or FALSE otherwis + */ + int parseBlock(const bool clear = true); + + /**< + * Used in parallel programming: + * Takes an input file stream and the necessary buffers and inflates `two` data without + * modifying the host container + * @param stream Input file stream + * @param inflate_buffer Support buffer for loading compressed data + * @param data_buffer Output buffer for inflated `two` entries + * @param compression_manager Compression manager + * @param clear Boolean set to TRUE if raw data should be cleared after invoking this function + * @return Returns TRUE upon success or FALSE otherwise + */ + int parseBlock(std::ifstream& stream, buffer_type& inflate_buffer, buffer_type& data_buffer, tgzf_controller_type& compression_manager, const bool clear = true) const; + + // Access: no random access. 
All these functions + // assumes that data is loaded linearly from disk + inline output_container_type getContainer(void){ return(output_container_type(this->data_)); } + output_container_type getContainerVariants(const U64 n_variants); + output_container_type getContainerBytes(const size_t l_data); + output_container_type getContainerBlocks(const U32 n_blocks); + inline output_container_reference_type getContainerReference(void){ return(output_container_reference_type(this->data_)); } + + // Access: requires complete (if n = 1) or partial (if n > 1) random access + output_container_reference_type getContainerReferenceBlock(const U32 blockID); + output_container_reference_type getContainerReferenceBlock(std::vector blocks); + output_container_type getContainerBlock(const U32 blockID); + output_container_type getContainerBlock(std::vector blocks); + + inline const bool isSorted(void) const{ return(this->index_->getController().isSorted == true); } + + // Basic operations + bool view(void); + bool view(const interval_type& interval); + bool view(const std::vector& intervals); + + // Other + bool index(const std::string& filename); + bool summary(const std::string& input, const U32 bins); + + // Concatenate + bool concat(const std::string& file_list, const std::string& output); + bool concat(const std::vector& files, const std::string& output); + + // + bool setWriterType(const int type); + void setWriteHeader(const bool write){ this->showHeader_ = write; } + + inline filter_type& getFilter(void){ return this->filters_; } + bool OpenWriter(void); + bool OpenWriter(const std::string output_file); + +private: + bool __Open(const std::string input); + bool ParseHeader(void); + bool ParseHeaderExtend(void); + + bool __viewOnly(void); + bool __viewFilter(void); + bool __viewRegion(void); + bool __viewRegionIndexed(void); + bool __checkRegionIndex(const entry_type& entry); + bool __checkRegionNoIndex(const entry_type& entry); + bool __concat(const std::vector& files, const 
std::string& output); + + bool __addRegions(std::vector& positions); + bool __ParseRegion(const std::string& region, interval_type& interval); + bool __ParseRegionIndexed(const std::string& region, interval_type& interval); + bool __ParseRegionIndexedBlocks(void); + +public: + U64 filesize_; // filesize + U64 offset_end_of_data_; + bool showHeader_; // flag to output header or not + std::ifstream stream_; // reader stream + + header_type header_; + footer_type footer_; + index_type* index_; + + buffer_type buffer_; // input buffer + buffer_type data_; // inflate buffer + buffer_type outputBuffer_; // output buffer + tgzf_controller_type tgzf_controller_; // compression controller + + filter_type filters_; // filter parameters + + tree_type** interval_tree; + std::vector* interval_tree_entries; +}; + +} /* namespace Tomahawk */ + +#endif /* TOMAHAWKOUTPUTREADER_H_ */ diff --git a/src/tomahawk/TomahawkOutput/TomahawkOutputStats.h b/src/tomahawk/two/TomahawkOutputStats.h similarity index 98% rename from src/tomahawk/TomahawkOutput/TomahawkOutputStats.h rename to src/tomahawk/two/TomahawkOutputStats.h index 10c0fce..a916aac 100644 --- a/src/tomahawk/TomahawkOutput/TomahawkOutputStats.h +++ b/src/tomahawk/two/TomahawkOutputStats.h @@ -83,7 +83,7 @@ struct TomahawkOutputStats{ struct TomahawkOutputStatsContainer{ typedef TomahawkOutputStats stats_type; - typedef IO::TomahawkOutputEntry entry_type; + typedef IO::OutputEntry entry_type; TomahawkOutputStatsContainer(const U32& bins) : n_bins(bins), R2(this->n_bins), D(this->n_bins), Dprime(this->n_bins){} diff --git a/src/tomahawk/two/output_entry.cpp b/src/tomahawk/two/output_entry.cpp new file mode 100644 index 0000000..30cea4a --- /dev/null +++ b/src/tomahawk/two/output_entry.cpp @@ -0,0 +1,73 @@ +#include "output_entry.h" + +namespace Tomahawk{ +namespace IO{ + +// These memcpy works because the struct is aligned without padding +OutputEntry::OutputEntry(const char* const data_buffer){ + memcpy(this, data_buffer, 
sizeof(self_type)); +} + +// Copy data from stream +OutputEntry::OutputEntry(const IO::BasicBuffer& data_buffer){ + memcpy(this, data_buffer.data(), sizeof(self_type)); +} + +OutputEntry::OutputEntry(const self_type* const other){ + memcpy(this, other, sizeof(self_type)); +} + +// Comparator function +// Called from sort helper only +bool OutputEntry::operator<(const self_type& other) const{ + if (this->AcontigID < other.AcontigID) return true; + if (other.AcontigID < this->AcontigID) return false; + + if (this->Aposition < other.Aposition) return true; + if (other.Aposition < this->Aposition) return false; + + if (this->BcontigID < other.BcontigID) return true; + if (other.BcontigID < this->BcontigID) return false; + + if (this->Bposition < other.Bposition) return true; + if (other.Bposition < this->Bposition) return false; + + return false; +} + +bool OutputEntry::operator<=(const self_type& other) const{ + if (this->AcontigID <= other.AcontigID) return true; + if (other.AcontigID <= this->AcontigID) return false; + + if (this->Aposition <= other.Aposition) return true; + if (other.Aposition <= this->Aposition) return false; + + if (this->BcontigID <= other.BcontigID) return true; + if (other.BcontigID <= this->BcontigID) return false; + + if (this->Bposition <= other.Bposition) return true; + if (other.Bposition <= this->Bposition) return false; + + return true; +} + +bool OutputEntry::operator==(const self_type& other) const{ + if (this->AcontigID != other.AcontigID) return false; + if (this->Aposition != other.Aposition) return false; + if (this->BcontigID != other.BcontigID) return false; + if (this->Bposition != other.Bposition) return false; + return true; +} + +void OutputEntry::swapDirection(void){ + U32 Ac = this->AcontigID; + U32 Bc = this->BcontigID; + this->AcontigID = Bc; + this->BcontigID = Ac; + U32& A = *reinterpret_cast(((char*)this + sizeof(U16) + sizeof(U32))); + U32& B = *reinterpret_cast(((char*)this + sizeof(U16) + 3*sizeof(U32))); + 
std::swap(A,B); +} + +} +} diff --git a/src/tomahawk/two/output_entry.h b/src/tomahawk/two/output_entry.h new file mode 100644 index 0000000..623d25b --- /dev/null +++ b/src/tomahawk/two/output_entry.h @@ -0,0 +1,108 @@ +#ifndef TOMAHAWKOUTPUTENTRY_H_ +#define TOMAHAWKOUTPUTENTRY_H_ + +#include "../../io/BasicBuffer.h" +#include "../../index/index_contig.h" + +namespace Tomahawk{ +namespace IO{ + +/**< + * Primary data structure for Tomahawk-generated LD output. + * This higher-order primitive can be interpreted directly + * from a packed buffer (unaligned memory access) or + * explicitly by invoking a valid constructor + */ +#pragma pack(push, 1) +struct __attribute__((packed, aligned(1))) OutputEntry{ +public: + typedef OutputEntry self_type; + typedef Totempole::HeaderContig contig_type; + typedef IO::BasicBuffer buffer_type; + +public: + // if interpreted directly from buffer stream + OutputEntry() : + FLAGS(0), + AcontigID(0), + Amissing(false), + Aphased(false), + Aposition(0), + BcontigID(0), + Bmissing(false), + Bphased(false), + Bposition(0), + p1(0), p2(0), q1(0), q2(0), + D(0), Dprime(0), + R(0), R2(0), + P(0), + chiSqFisher(0), + chiSqModel(0) + { + + } + + ~OutputEntry() = default; + // Copy data from stream + OutputEntry(const char* const data_buffer); + // Copy data from stream + OutputEntry(const buffer_type& data_buffer); + OutputEntry(const self_type* const other); + + // Comparator function + // Called from sort helper only + inline static bool sortDescending(const self_type& a, const self_type& b){ return(a < b); } + inline static bool sortAscending(const self_type& a, const self_type& b){ return(a > b); } + bool operator< (const self_type& other) const; + bool operator<=(const self_type& other) const; + bool operator==(const self_type& other) const; + + // Comparator function: inverse of lesser comparator + inline bool operator> (const self_type& other) const{ return(!((*this) < other)); } + inline bool operator>=(const self_type& other) const{ 
return(!((*this) <= other)); } + + // Swaps cA,pA with cB,pB + // used in sorting for indices + void swapDirection(void); + + friend std::ostream& operator<<(std::ostream& os, const self_type& entry){ + os << std::setprecision(8) << (int)entry.FLAGS << '\t' << entry.AcontigID << '\t' << entry.Aposition << '\t' << entry.BcontigID << '\t' << entry.Bposition + << '\t' << entry.p1 << '\t' << entry.p2 << '\t' << entry.q1 << '\t' << entry.q2 << '\t' << entry.D << '\t' << entry.Dprime + << '\t' << entry.R << '\t' << entry.R2 << '\t' << entry.P << '\t' << entry.chiSqFisher << '\t' << entry.chiSqModel; + + return(os); + } + + std::ostream& write(std::ostream& os, const contig_type* const contigs) const{ + os << std::setprecision(8) << (int)this->FLAGS << '\t' << contigs[this->AcontigID].name << '\t' << this->Aposition << '\t' << contigs[this->BcontigID].name << '\t' << this->Bposition + << '\t' << this->p1 << '\t' << this->p2 << '\t' << this->q1 << '\t' << this->q2 << '\t' << this->D << '\t' << this->Dprime + << '\t' << this->R << '\t' << this->R2 << '\t' << this->P << '\t' << this->chiSqFisher << '\t' << this->chiSqModel << '\n'; + + return(os); + } + + // Write to buffer + friend buffer_type& operator<<(buffer_type& b, const self_type& entry){ + b.Add(reinterpret_cast(&entry), sizeof(self_type)); + return(b); + } + +public: + U16 FLAGS; + U32 AcontigID; + U32 Amissing: 1, Aphased: 1, Aposition: 30; + U32 BcontigID; + U32 Bmissing: 1, Bphased: 1, Bposition: 30; + float p1, p2, q1, q2; + float D, Dprime; // D and D' + float R, R2; // Correlation coefficient + double P; // P-value + double chiSqFisher; + double chiSqModel; +}; +#pragma pack(pop) + +} +} + +#endif /* TOMAHAWKOUTPUTENTRY_H_ */ diff --git a/src/tomahawk/TomahawkOutput/TomahawkOutputLD.cpp b/src/tomahawk/two/output_entry_support.cpp similarity index 79% rename from src/tomahawk/TomahawkOutput/TomahawkOutputLD.cpp rename to src/tomahawk/two/output_entry_support.cpp index c30ce81..e1e5140 100644 --- 
a/src/tomahawk/TomahawkOutput/TomahawkOutputLD.cpp +++ b/src/tomahawk/two/output_entry_support.cpp @@ -1,9 +1,9 @@ -#include "TomahawkOutputLD.h" +#include "../two/output_entry_support.h" namespace Tomahawk { namespace Support { -TomahawkOutputLD::TomahawkOutputLD() : +OutputEntrySupport::OutputEntrySupport() : controller(0), R2(0), D(0), @@ -16,10 +16,10 @@ TomahawkOutputLD::TomahawkOutputLD() : { } -TomahawkOutputLD::~TomahawkOutputLD(){ } +OutputEntrySupport::~OutputEntrySupport(){ } -void TomahawkOutputLD::operator=(const TomahawkOutputLD& other){ +void OutputEntrySupport::operator=(const OutputEntrySupport& other){ this->R2 = other.R2; this->D = other.D; this->Dprime = other.Dprime; @@ -29,14 +29,14 @@ void TomahawkOutputLD::operator=(const TomahawkOutputLD& other){ memcpy(&this->haplotypeCounts[0], &other.haplotypeCounts[0], sizeof(float)*4); } -void TomahawkOutputLD::printUnphasedCounts(void) const{ +void OutputEntrySupport::printUnphasedCounts(void) const{ // Prints 3x3 Punnett square for unphased data std::cerr << this->alleleCounts[0] << '\t' << this->alleleCounts[1] + this->alleleCounts[4] << '\t' << this->alleleCounts[5] << '\t' << this->alleleCounts[16] + this->alleleCounts[64] << '\t' << this->alleleCounts[17] + this->alleleCounts[20] + this->alleleCounts[65] + this->alleleCounts[68] << '\t' << this->alleleCounts[21] + this->alleleCounts[69] << '\t' << this->alleleCounts[80] << '\t' << this->alleleCounts[81]+this->alleleCounts[84] << '\t' << this->alleleCounts[85] << std::endl; } -void TomahawkOutputLD::printPhasedCounts(void) const{ +void OutputEntrySupport::printPhasedCounts(void) const{ std::cerr << this->alleleCounts[0] << '\t' << this->alleleCounts[1] << '\t' << this->alleleCounts[4] << '\t' << this->alleleCounts[5] << std::endl; } diff --git a/src/tomahawk/TomahawkOutput/TomahawkOutputLD.h b/src/tomahawk/two/output_entry_support.h similarity index 55% rename from src/tomahawk/TomahawkOutput/TomahawkOutputLD.h rename to 
src/tomahawk/two/output_entry_support.h index 9b2ee5e..95d6fa9 100644 --- a/src/tomahawk/TomahawkOutput/TomahawkOutputLD.h +++ b/src/tomahawk/two/output_entry_support.h @@ -6,28 +6,32 @@ namespace Tomahawk { namespace Support{ -struct TomahawkOutputLD{ +struct OutputEntrySupport{ +private: + typedef OutputEntrySupport self_type; + typedef IO::BasicBuffer buffer_type; + public: - TomahawkOutputLD(); - ~TomahawkOutputLD(); + OutputEntrySupport(); + ~OutputEntrySupport(); inline void resetPhased(void){ - this->controller = 0; + this->controller = 0; this->alleleCounts[0] = 0; this->alleleCounts[1] = 0; this->alleleCounts[4] = 0; this->alleleCounts[5] = 0; - this->chiSqModel = 0; + this->chiSqModel = 0; // All other values can legally overflow // They are not used } inline void resetUnphased(void){ - this->controller = 0; - this->alleleCounts[0] = 0; - this->alleleCounts[1] = 0; - this->alleleCounts[4] = 0; - this->alleleCounts[5] = 0; + this->controller = 0; + this->alleleCounts[0] = 0; + this->alleleCounts[1] = 0; + this->alleleCounts[4] = 0; + this->alleleCounts[5] = 0; this->alleleCounts[16] = 0; this->alleleCounts[17] = 0; this->alleleCounts[20] = 0; @@ -46,11 +50,11 @@ struct TomahawkOutputLD{ inline float& operator[](const U32& p){ return(this->alleleCounts[p]); } inline const float& operator[](const U32& p) const{ return(this->alleleCounts[p]); } - void operator=(const TomahawkOutputLD& other); + void operator=(const OutputEntrySupport& other); void printUnphasedCounts(void) const; void printPhasedCounts(void) const; - friend IO::BasicBuffer& operator<<(IO::BasicBuffer& os, const TomahawkOutputLD& entry){ + friend IO::BasicBuffer& operator<<(buffer_type& os, const self_type& entry){ // Notice that CONTROLLER is written separately os += entry.alleleCounts[0]; os += entry.alleleCounts[1]; @@ -58,6 +62,7 @@ struct TomahawkOutputLD{ os += entry.alleleCounts[5]; os += entry.D; os += entry.Dprime; + os += entry.R; os += entry.R2; os += entry.P; os += 
entry.chiSqFisher; @@ -65,22 +70,22 @@ struct TomahawkOutputLD{ return os; } - friend std::ostream& operator<<(std::ostream& os, const TomahawkOutputLD& entry){ + friend std::ostream& operator<<(std::ostream& os, const self_type& entry){ os << entry.alleleCounts[0] << '\t' << entry.alleleCounts[1] << '\t' << entry.alleleCounts[4] << '\t' << entry.alleleCounts[5] << '\t' - << entry.D << '\t' << entry.Dprime << '\t' << entry.R2 << '\t' << entry.P << '\t' << entry.chiSqFisher; + << entry.D << '\t' << entry.Dprime << '\t' << entry.R << '\t' << entry.R2 << '\t' << entry.P << '\t' << entry.chiSqFisher; return os; } - inline void setPhased(void){ this->controller |= 1; } - inline void setHasMissingValues(void){ this->controller |= 2; } - inline void setIncomplete(void){ this->controller |= 4; } - inline void setMultipleRoots(void){ this->controller |= 8; } - inline void setSameContig(void){ this->controller |= 16; } - inline void setLongRange(void){ this->controller |= 32; } - inline void setFailedHWEA(void){ this->controller |= 64; } - inline void setFailedHWEB(void){ this->controller |= 128; } - inline void setLowMAFA(void){ this->controller |= 256; } - inline void setLowMAFB(void){ this->controller |= 512; } + inline void setPhased(void) { this->controller |= 1; } + inline void setHasMissingValues(void){ this->controller |= 2; } + inline void setIncomplete(void) { this->controller |= 4; } + inline void setMultipleRoots(void) { this->controller |= 8; } + inline void setSameContig(void) { this->controller |= 16; } + inline void setLongRange(void) { this->controller |= 32; } + inline void setFailedHWEA(void) { this->controller |= 64; } + inline void setFailedHWEB(void) { this->controller |= 128; } + inline void setLowMAFA(void) { this->controller |= 256; } + inline void setLowMAFB(void) { this->controller |= 512; } inline const float countAlternatives(void) const{ // Find largest @@ -100,19 +105,17 @@ struct TomahawkOutputLD{ } public: - U16 controller; // FLAG byte - 
float R2; // R squared - float D; // D - float Dprime; // D' - float Dmax; // Dmax - double P; // Fisher or Chi-Squared P value for 2x2 contingency table - double chiSqModel; // Chi-Squared critical value for 3x3 contingency table - double chiSqFisher; // Chi-Squared critical value for 2x2 contingency table - float totalAlleleCounts; // Total number of alleles + U16 controller; // FLAG byte + float R, R2; // Correlation coefficients + float D, Dprime, Dmax; // D values + double P; // Fisher or Chi-Squared P value for 2x2 contingency table + double chiSqModel; // Chi-Squared critical value for 3x3 contingency table + double chiSqFisher; // Chi-Squared critical value for 2x2 contingency table + float totalAlleleCounts; // Total number of alleles // Counters - float alleleCounts[171]; - float haplotypeCounts[4]; + float alleleCounts[171]; + float haplotypeCounts[4]; }; } /* namespace Support */ diff --git a/src/tomahawk/TomahawkOutput/TomahawkOutputFilterController.cpp b/src/tomahawk/two/output_filter.cpp similarity index 84% rename from src/tomahawk/TomahawkOutput/TomahawkOutputFilterController.cpp rename to src/tomahawk/two/output_filter.cpp index eae0bae..148ae16 100644 --- a/src/tomahawk/TomahawkOutput/TomahawkOutputFilterController.cpp +++ b/src/tomahawk/two/output_filter.cpp @@ -1,10 +1,10 @@ #include #include -#include "TomahawkOutputFilterController.h" +#include "output_filter.h" namespace Tomahawk { -TomahawkOutputFilterController::TomahawkOutputFilterController() : +OutputFilter::OutputFilter() : any_filter_user_set(false), minP1(0), minP2(0), @@ -26,9 +26,9 @@ TomahawkOutputFilterController::TomahawkOutputFilterController() : filterValueExclude(0) {} -TomahawkOutputFilterController::~TomahawkOutputFilterController(){} +OutputFilter::~OutputFilter(){} -std::string TomahawkOutputFilterController::getInterpretedString(void) const{ +std::string OutputFilter::getInterpretedString(void) const{ if(this->any_filter_user_set){ return(std::string( "minP1=" + 
std::to_string(this->minP1) + " " + @@ -61,7 +61,7 @@ std::string TomahawkOutputFilterController::getInterpretedString(void) const{ } } -bool TomahawkOutputFilterController::filter(const entry_type& target) const{ +bool OutputFilter::filter(const entry_type& target) const{ if(((target.FLAGS & this->filterValueInclude) != this->filterValueInclude) || ((target.FLAGS & this->filterValueExclude) != 0)) return false; @@ -80,11 +80,11 @@ bool TomahawkOutputFilterController::filter(const entry_type& target) const{ return true; } -bool TomahawkOutputFilterController::filterHF(const entry_type& target) const{ +bool OutputFilter::filterHF(const entry_type& target) const{ return(target.p1 >= this->minP1 || target.p2 >= this->minP2 || target.q1 >= this->minQ1 || target.q2 >= this->minQ2); } -bool TomahawkOutputFilterController::filterJointHF(const entry_type& target) const{ +bool OutputFilter::filterJointHF(const entry_type& target) const{ // find largest const float* max = &target.p1; if(target.p2 > *max) max = &target.p2; @@ -101,7 +101,7 @@ bool TomahawkOutputFilterController::filterJointHF(const entry_type& target) con return(total > this->minMHF); } -bool TomahawkOutputFilterController::setFilterTable(const double& a, const double& b, const double& c, const double& d){ +bool OutputFilter::setFilterTable(const double& a, const double& b, const double& c, const double& d){ if(a < 0 || b < 0 || c < 0 || d < 0){ std::cerr << "cannot have negative filter values" << std::endl; return false; @@ -120,7 +120,7 @@ bool TomahawkOutputFilterController::setFilterTable(const double& a, const doubl return true; } -bool TomahawkOutputFilterController::setFilterTable(const double& all){ +bool OutputFilter::setFilterTable(const double& all){ if(all < 0){ std::cerr << "cannot have negative filter values" << std::endl; return false; @@ -137,7 +137,7 @@ bool TomahawkOutputFilterController::setFilterTable(const double& all){ return true; } -bool TomahawkOutputFilterController::setFilterD(const 
float& min, const float& max){ +bool OutputFilter::setFilterD(const float& min, const float& max){ if(max < min){ std::cerr << "max < min" << std::endl; return false; @@ -157,7 +157,7 @@ bool TomahawkOutputFilterController::setFilterD(const float& min, const float& m return true; } -bool TomahawkOutputFilterController::setFilterDprime(const float& min, const float& max){ +bool OutputFilter::setFilterDprime(const float& min, const float& max){ if(max < min){ std::cerr << "max < min" << std::endl; return false; @@ -177,7 +177,7 @@ bool TomahawkOutputFilterController::setFilterDprime(const float& min, const flo return true; } -bool TomahawkOutputFilterController::setFilterRsquared(const float& min, const float& max){ +bool OutputFilter::setFilterRsquared(const float& min, const float& max){ if(max < min){ std::cerr << "max < min" << std::endl; return false; @@ -197,7 +197,7 @@ bool TomahawkOutputFilterController::setFilterRsquared(const float& min, const f return true; } -bool TomahawkOutputFilterController::setFilterP(const double& min, const double& max){ +bool OutputFilter::setFilterP(const double& min, const double& max){ if(max < min){ std::cerr << "max < min" << std::endl; return false; @@ -217,7 +217,7 @@ bool TomahawkOutputFilterController::setFilterP(const double& min, const double& return true; } -bool TomahawkOutputFilterController::setFilterPmodel(const double& min, const double& max){ +bool OutputFilter::setFilterPmodel(const double& min, const double& max){ if(max < min){ std::cerr << "max < min" << std::endl; return false; @@ -237,7 +237,7 @@ bool TomahawkOutputFilterController::setFilterPmodel(const double& min, const do return true; } -bool TomahawkOutputFilterController::setFilterChiSquared(const double& min, const double& max){ +bool OutputFilter::setFilterChiSquared(const double& min, const double& max){ if(max < min){ std::cerr << "max < min" << std::endl; return false; @@ -253,7 +253,7 @@ bool 
TomahawkOutputFilterController::setFilterChiSquared(const double& min, cons return true; } -bool TomahawkOutputFilterController::setFilterMHF(const double& min, const double& max){ +bool OutputFilter::setFilterMHF(const double& min, const double& max){ if(min < 0){ std::cerr << "min < 0" << std::endl; return false; diff --git a/src/tomahawk/TomahawkOutput/TomahawkOutputFilterController.h b/src/tomahawk/two/output_filter.h similarity index 82% rename from src/tomahawk/TomahawkOutput/TomahawkOutputFilterController.h rename to src/tomahawk/two/output_filter.h index 59c63db..610681b 100644 --- a/src/tomahawk/TomahawkOutput/TomahawkOutputFilterController.h +++ b/src/tomahawk/two/output_filter.h @@ -5,20 +5,20 @@ #include #include -#include "../../support/TypeDefinitions.h" -#include "TomahawkOutputEntry.h" +#include "../../support/type_definitions.h" #include "../../support/MagicConstants.h" +#include "../two/output_entry.h" namespace Tomahawk { -class TomahawkOutputFilterController { - typedef TomahawkOutputFilterController self_type; - typedef IO::TomahawkOutputEntry entry_type; +class OutputFilter { + typedef OutputFilter self_type; + typedef IO::OutputEntry entry_type; typedef bool (self_type::*filterFunction)(const entry_type& entry) const; public: - TomahawkOutputFilterController(); - ~TomahawkOutputFilterController(); + OutputFilter(); + ~OutputFilter(); inline const bool& isAnySet(void) const{ return(this->any_filter_user_set); } @@ -58,20 +58,20 @@ class TomahawkOutputFilterController { double minP1, minP2, minQ1, minQ2; double maxP1, maxP2, maxQ1, maxQ2; double minMHF, maxMHF; - float minD, maxD; - float minDprime, maxDprime; + float minD, maxD; + float minDprime, maxDprime; double minR2, maxR2; double minP, maxP; double minChiSquared, maxChiSquared; double minPmodel, maxPmodel; - U16 filterValueInclude; - U16 filterValueExclude; + U16 filterValueInclude; + U16 filterValueExclude; std::vector filter_functions; // push filter functions to array and loop over }; 
template -inline bool TomahawkOutputFilterController::filter(const T& value, const Y& min, const Y& max) const{ +inline bool OutputFilter::filter(const T& value, const Y& min, const Y& max) const{ if(value < min || value > max) return false; diff --git a/src/totempole/TotempoleContig.h b/src/totempole/TotempoleContig.h deleted file mode 100644 index 3dcb058..0000000 --- a/src/totempole/TotempoleContig.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef TOTEMPOLECONTIG_H_ -#define TOTEMPOLECONTIG_H_ - -#include -#include - -namespace Tomahawk{ -namespace Totempole{ - -struct TotempoleContigBase{ - typedef TotempoleContigBase self_type; - -public: - TotempoleContigBase(const U32& bases, const U32& n_char, const std::string& name) : bases(bases), n_char(n_char), name(name){} - TotempoleContigBase() : bases(0), n_char(0){} - ~TotempoleContigBase(){} - - friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ - stream << entry.bases << '\t' << entry.n_char << '\t' << entry.name; - return stream; - } - - friend std::ofstream& operator<<(std::ofstream& stream, const self_type& base){ - stream.write(reinterpret_cast(&base.bases), sizeof(U32)); - stream.write(reinterpret_cast(&base.n_char), sizeof(U32)); - stream.write(reinterpret_cast(&base.name[0]), base.name.size()); - return(stream); - } - - friend std::istream& operator>>(std::istream& stream, self_type& base){ - stream.read(reinterpret_cast(&base.bases), sizeof(U32)); - stream.read(reinterpret_cast(&base.n_char), sizeof(U32)); - base.name.resize(base.n_char); - stream.read(&base.name[0], base.n_char); - return(stream); - } - -public: - U32 bases; // length of contig - U32 n_char; // number of chars - std::string name; // contig name -}; - -struct TotempoleContig : public TotempoleContigBase{ -typedef TotempoleContig self_type; - -public: - TotempoleContig() : minPosition(0), maxPosition(0), blocksStart(0), blocksEnd(0){} - ~TotempoleContig(){} - - friend std::ostream& operator<<(std::ostream& stream, 
const self_type& entry){ - stream << entry.name << '\t' << entry.bases << '\t' << entry.minPosition << "-" << entry.maxPosition << '\t' << entry.blocksStart << "->" << entry.blocksEnd; - return stream; - } - - // Updated second when read - // contigID is implicit - U32 minPosition; // start position of contig - U32 maxPosition; // end position of contig - U32 blocksStart; // start IO-seek position of blocks - U32 blocksEnd; // end IO-seek position of blocks -}; - -} -} - - - - -#endif /* TOTEMPOLECONTIG_H_ */ diff --git a/src/totempole/TotempoleEntry.h b/src/totempole/TotempoleEntry.h deleted file mode 100644 index bda0f36..0000000 --- a/src/totempole/TotempoleEntry.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef TOTEMPOLEENTRY_H_ -#define TOTEMPOLEENTRY_H_ - -namespace Tomahawk{ -namespace Totempole{ - -#pragma pack(1) -struct TotempoleEntry{ - typedef TotempoleEntry self_type; - -public: - TotempoleEntry() : byte_offset(0), byte_offset_end(0), contigID(0), minPosition(0), maxPosition(0), variants(0), uncompressed_size(0){} - ~TotempoleEntry(){} - - inline bool isValid(void) const{ return(this->byte_offset != 0); } - inline void operator++(void){ ++this->variants; } - - friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ - stream << entry.byte_offset << '\t' << entry.byte_offset_end << '\t' << entry.contigID << '\t' << entry.minPosition << '-' << entry.maxPosition << '\t' << entry.variants << '\t' << entry.uncompressed_size; - return stream; - } - - friend std::ofstream& operator<<(std::ofstream& stream, const self_type& entry){ - stream.write(reinterpret_cast(&entry.byte_offset), sizeof(U64)); - stream.write(reinterpret_cast(&entry.byte_offset_end), sizeof(U64)); - stream.write(reinterpret_cast(&entry.contigID), sizeof(U32)); - stream.write(reinterpret_cast(&entry.minPosition), sizeof(U32)); - stream.write(reinterpret_cast(&entry.maxPosition), sizeof(U32)); - stream.write(reinterpret_cast(&entry.variants), sizeof(U16)); - 
stream.write(reinterpret_cast(&entry.uncompressed_size), sizeof(U32)); - return stream; - } - - friend std::istream& operator>>(std::istream& stream, self_type& entry){ - stream.read(reinterpret_cast(&entry.byte_offset), sizeof(U64)); - stream.read(reinterpret_cast(&entry.byte_offset_end), sizeof(U64)); - stream.read(reinterpret_cast(&entry.contigID), sizeof(U32)); - stream.read(reinterpret_cast(&entry.minPosition), sizeof(U32)); - stream.read(reinterpret_cast(&entry.maxPosition), sizeof(U32)); - stream.read(reinterpret_cast(&entry.variants), sizeof(U16)); - stream.read(reinterpret_cast(&entry.uncompressed_size), sizeof(U32)); - - return(stream); - } - - void reset(void){ - this->byte_offset = 0; - this->byte_offset_end = 0; - this->contigID = 0; - this->minPosition = 0; - this->maxPosition = 0; - this->variants = 0; - this->uncompressed_size = 0; - } - -public: - U64 byte_offset; // tellg() position in stream for start of record in Tomahawk file - U64 byte_offset_end;// tellg() position in stream for start of record in Tomahawk file - S32 contigID; // contig identifier - U32 minPosition; // smallest bp position in tomahawk block - U32 maxPosition; // largest bp position in tomahawk block - U16 variants; // number of variants in this block - U32 uncompressed_size; // uncompressed size of this block -}; - -} -} - -#endif /* TOTEMPOLEENTRY_H_ */ diff --git a/src/totempole/TotempoleHeader.h b/src/totempole/TotempoleHeader.h deleted file mode 100644 index 29b4b01..0000000 --- a/src/totempole/TotempoleHeader.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef TOTEMPOLEHEADER_H_ -#define TOTEMPOLEHEADER_H_ - -#include - -namespace Tomahawk { -namespace Totempole { - -struct TotempoleHeaderBase{ - typedef TotempoleHeaderBase self_type; - -public: - TotempoleHeaderBase() : version(0), samples(0){} - TotempoleHeaderBase(const U64 samples) : - version(Constants::PROGRAM_VERSION), - samples(samples) - {} - - friend std::ostream& operator<<(std::ofstream& stream, const self_type& 
header){ - stream.write(reinterpret_cast(&Constants::PROGRAM_VERSION), sizeof(float)); - stream.write(reinterpret_cast(&header.samples), sizeof(U64)); - return stream; - } - - friend std::istream& operator>>(std::istream& stream, self_type& header){ - stream.read(reinterpret_cast(&header.version), sizeof(float)); - stream.read(reinterpret_cast(&header.samples), sizeof(U64)); - return(stream); - } - -public: - float version; // version used to write header - U64 samples; // number of samples -}; - -struct TotempoleHeader : public TotempoleHeaderBase{ - typedef TotempoleHeader self_type; - -public: - TotempoleHeader() : - controller(0), - blocks(0), - largest_uncompressed(0), - offset(0) - {} - - // This ctor is used during construction - // only possible when sample count is known - TotempoleHeader(const U64 samples) : - TotempoleHeaderBase(samples), - controller(0), - blocks(0), - largest_uncompressed(0), - offset(0) - {} - ~TotempoleHeader(){} - - friend std::ostream& operator<<(std::ofstream& stream, const self_type& header){ - // version | sample count | controller byte | blocks | largest uncompressed - stream.write(reinterpret_cast(&header.version), sizeof(float)); - stream.write(reinterpret_cast(&header.samples), sizeof(U64)); - stream.write(reinterpret_cast(&header.controller), sizeof(BYTE)); // Controller byte - // At end-of-file, reopen file as in | out | binary and seek to this position and overwrite with the correct position - stream.write(reinterpret_cast(&header.blocks), sizeof(U32)); // Number of blocks in Tomahawk - stream.write(reinterpret_cast(&header.largest_uncompressed), sizeof(U32)); // Size of largest uncompressed block - return(stream); - } - - friend std::ostream& operator<<(std::ostream& os, const self_type& block){ - os << - "version: " << block.version << '\n' << - "samples: " << block.samples << '\n' << - "controller: " << std::bitset<8>(block.controller) << '\n' << - "blocks: " << block.blocks << '\n' << - "largest: " << 
block.largest_uncompressed << '\n' << - "offset: " << block.offset; - - return(os); - } - - friend std::istream& operator>>(std::istream& stream, self_type& block){ - stream.read(reinterpret_cast(&block.version), sizeof(float)); - stream.read(reinterpret_cast(&block.samples), sizeof(U64)); - stream.read(reinterpret_cast(&block.controller), sizeof(BYTE)); - stream.read(reinterpret_cast(&block.blocks), sizeof(U32)); - stream.read(reinterpret_cast(&block.largest_uncompressed), sizeof(U32)); - stream.read(reinterpret_cast(&block.offset), sizeof(U32)); - - return(stream); - } - -public: - BYTE controller; // controller block - U32 blocks; // number of blocks in Tomahawk - U32 largest_uncompressed; // largest block-size in bytes - U32 offset; // IO disk offset for start of data -}; - -} -} - -#endif /* TOTEMPOLEHEADER_H_ */ diff --git a/src/totempole/TotempoleMagic.h b/src/totempole/TotempoleMagic.h deleted file mode 100644 index 07220fc..0000000 --- a/src/totempole/TotempoleMagic.h +++ /dev/null @@ -1,156 +0,0 @@ -#ifndef TOTEMPOLEMAGIC_H_ -#define TOTEMPOLEMAGIC_H_ - -#include -#include -#include -#include "TotempoleOutputEntry.h" -#include "../support/MagicConstants.h" - -namespace Tomahawk{ -namespace IO{ - -template -struct MAGICBase{ - typedef MAGICBase self_type; - -public: - MAGICBase(){} // for reading - MAGICBase(const char* target){ memcpy(&this->MAGIC[0], target, length); } // for writing - MAGICBase(const self_type& other){ memcpy(&this->MAGIC[0], &other.MAGIC[0], length); } - virtual ~MAGICBase(){} - - friend std::istream& operator>>(std::istream& stream, self_type& base){ - stream.read(base.MAGIC, length); - return(stream); - } - - friend std::ostream& operator<<(std::ofstream& stream, const self_type& base){ - stream.write(base.MAGIC, length); - return stream; - } - - virtual inline bool validate(const char* match) const{ return(strncmp(&this->MAGIC[0], match, length) == 0); } - -public: - char MAGIC[length]; -}; - -template -struct TomahawkHeader : 
public MAGICBase{ - typedef TomahawkHeader self_type; - typedef MAGICBase parent_type; - - TomahawkHeader() : version(0), samples(0){} // for reading - TomahawkHeader(const char* target, const U64 samples) : - version(Tomahawk::Constants::PROGRAM_VERSION), - samples(samples) - { - memcpy(&this->MAGIC[0], target, length); - } // for writing - - friend std::ostream& operator<<(std::ofstream& stream, const self_type& header){ - stream.write(header.MAGIC, length); - stream.write(reinterpret_cast(&Tomahawk::Constants::PROGRAM_VERSION), sizeof(float)); - stream.write(reinterpret_cast(&header.samples), sizeof(U64)); - return stream; - } - - friend std::istream& operator>>(std::istream& stream, self_type& header){ - stream.read(header.MAGIC, length); - stream.read(reinterpret_cast(&header.version), sizeof(float)); - stream.read(reinterpret_cast(&header.samples), sizeof(U64)); - return(stream); - } - -public: - float version; - U64 samples; -}; - -template -struct TomahawkOutputHeader : public TomahawkHeader{ - typedef TomahawkOutputHeader self_type; - typedef TomahawkHeader parent_type; - - TomahawkOutputHeader() : n_contig(0), n_entries(0){} // for reading - TomahawkOutputHeader(const char* target, const U64 samples, const U32 n_contigs) : - parent_type(target, samples), - n_contig(n_contigs), - n_entries(0) - { - memcpy(&this->MAGIC[0], target, length); - } // for writing - - friend std::ostream& operator<<(std::ofstream& stream, const self_type& header){ - stream.write(header.MAGIC, length); - stream.write(reinterpret_cast(&Tomahawk::Constants::PROGRAM_VERSION), sizeof(float)); - stream.write(reinterpret_cast(&header.samples), sizeof(U64)); - stream.write(reinterpret_cast(&header.n_contig), sizeof(U32)); - stream.write(reinterpret_cast(&header.n_entries), sizeof(U32)); - return stream; - } - - friend std::istream& operator>>(std::istream& stream, self_type& header){ - stream.read(header.MAGIC, length); - stream.read(reinterpret_cast(&header.version), sizeof(float)); - 
stream.read(reinterpret_cast(&header.samples), sizeof(U64)); - stream.read(reinterpret_cast(&header.n_contig), sizeof(U32)); - - // Legacy fix - if(header.version >= 0.2) - stream.read(reinterpret_cast(&header.n_entries), sizeof(U32)); - else - header.n_entries = 0; - - return(stream); - } - -public: - U32 n_contig; - U32 n_entries; -}; - -template -struct TomahawkOutputSortHeader : public TomahawkOutputHeader{ - typedef TomahawkOutputSortHeader self_type; - typedef TomahawkOutputHeader parent_type; - typedef Totempole::TotempoleOutputEntryController totempole_controller_byte; - - TomahawkOutputSortHeader(){} // for reading - TomahawkOutputSortHeader(const char* target, const U64 samples, const U32 n_contigs) : - parent_type(target, samples, n_contigs) - { - memcpy(&this->MAGIC[0], target, length); - } // for writing - - friend std::ostream& operator<<(std::ofstream& stream, const self_type& header){ - stream.write(header.MAGIC, length); - stream.write(reinterpret_cast(&Tomahawk::Constants::PROGRAM_VERSION), sizeof(float)); - stream.write(reinterpret_cast(&header.samples), sizeof(U64)); - stream.write(reinterpret_cast(&header.n_contig), sizeof(U32)); - stream.write(reinterpret_cast(&header.n_entries), sizeof(U32)); - stream.write(reinterpret_cast(&header.controller), sizeof(BYTE)); - return stream; - } - - friend std::istream& operator>>(std::istream& stream, self_type& header){ - stream.read(header.MAGIC, length); - stream.read(reinterpret_cast(&header.version), sizeof(float)); - stream.read(reinterpret_cast(&header.samples), sizeof(U64)); - stream.read(reinterpret_cast(&header.n_contig), sizeof(U32)); - stream.read(reinterpret_cast(&header.n_entries), sizeof(U32)); - stream.read(reinterpret_cast(&header.controller), sizeof(BYTE)); - return(stream); - } - - inline void setSorted(const bool yes){ this->controller.sorted = yes; } - -public: - totempole_controller_byte controller; -}; - -} -} - -#endif /* TOTEMPOLEMAGIC_H_ */ diff --git 
a/src/totempole/TotempoleOutputEntry.h b/src/totempole/TotempoleOutputEntry.h deleted file mode 100644 index c8f4e0b..0000000 --- a/src/totempole/TotempoleOutputEntry.h +++ /dev/null @@ -1,142 +0,0 @@ -#ifndef TOTEMPOLEOUTPUTENTRY_H_ -#define TOTEMPOLEOUTPUTENTRY_H_ - -#include "../support/TypeDefinitions.h" - -namespace Tomahawk { -namespace Totempole { - -#pragma pack(1) -struct TotempoleOutputEntryController{ - typedef TotempoleOutputEntryController self_type; - - BYTE sorted: 1, expanded: 1, partial_sort: 1, unused: 5; - - friend std::ofstream& operator<<(std::ofstream& stream, const self_type& entry){ - stream.write((const char*)reinterpret_cast(&entry), 1); - return(stream); - } - - friend std::ifstream& operator>>(std::ifstream& stream, self_type& entry){ - stream.read(reinterpret_cast(&entry), 1); - return(stream); - } -}; - -#pragma pack(1) -struct TotempoleOutputEntry{ - typedef TotempoleOutputEntry self_type; - -public: - TotempoleOutputEntry() : - byte_offset(0), - byte_offset_end(0), - entries(0), - uncompressed_size(0) - {} - ~TotempoleOutputEntry(){} - - inline const bool isValid(void) const{ return(this->byte_offset != 0); } - inline void operator++(void){ ++this->entries; } - - friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ - stream << entry.byte_offset << '-' << entry.byte_offset_end << '\t' << entry.entries << '\t' - << entry.uncompressed_size; - return stream; - } - - friend std::ofstream& operator<<(std::ofstream& stream, const self_type& entry){ - stream.write(reinterpret_cast(&entry.byte_offset), sizeof(U64)); - stream.write(reinterpret_cast(&entry.byte_offset_end), sizeof(U64)); - stream.write(reinterpret_cast(&entry.entries), sizeof(U32)); - stream.write(reinterpret_cast(&entry.uncompressed_size), sizeof(U32)); - return stream; - } - - friend std::istream& operator>>(std::istream& stream, self_type& entry){ - stream.read(reinterpret_cast(&entry.byte_offset), sizeof(U64)); - 
stream.read(reinterpret_cast(&entry.byte_offset_end), sizeof(U64)); - stream.read(reinterpret_cast(&entry.entries), sizeof(U32)); - stream.read(reinterpret_cast(&entry.uncompressed_size), sizeof(U32)); - - return(stream); - } - - void reset(void){ - this->byte_offset = 0; - this->byte_offset_end = 0; - this->entries = 0; - this->uncompressed_size = 0; - } - -public: - U64 byte_offset; // tellg() position in stream for start of record in Tomahawk file - U64 byte_offset_end; // tellg() position in stream for start of record in Tomahawk file - U32 entries; // number of variants in this block - U32 uncompressed_size; // uncompressed size of this block -}; - -#pragma pack(1) -struct TotempoleOutputSortedEntry{ - typedef TotempoleOutputSortedEntry self_type; - -public: - TotempoleOutputSortedEntry() : - //contigID(0), - fromBlock(-1), - fromBlock_entries_offset(0), - toBlock(-1), - toBlock_entries_offset(0) - {} - ~TotempoleOutputSortedEntry(){} - - friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ - stream << entry.fromBlock << ':' << entry.fromBlock_entries_offset << "->" - << entry.toBlock << ':' << entry.toBlock_entries_offset; - return stream; - } - - friend std::ofstream& operator<<(std::ofstream& stream, const self_type& entry){ - //stream.write(reinterpret_cast(&entry.contigID), sizeof(U32)); - stream.write(reinterpret_cast(&entry.fromBlock), sizeof(S32)); - stream.write(reinterpret_cast(&entry.fromBlock_entries_offset), sizeof(U32)); - stream.write(reinterpret_cast(&entry.toBlock), sizeof(S32)); - stream.write(reinterpret_cast(&entry.toBlock_entries_offset), sizeof(U32)); - return stream; - } - - friend std::istream& operator>>(std::istream& stream, self_type& entry){ - //stream.read(reinterpret_cast(&entry.contigID), sizeof(U32)); - stream.read(reinterpret_cast(&entry.fromBlock), sizeof(S32)); - stream.read(reinterpret_cast(&entry.fromBlock_entries_offset), sizeof(U32)); - stream.read(reinterpret_cast(&entry.toBlock), sizeof(S32)); 
- stream.read(reinterpret_cast(&entry.toBlock_entries_offset), sizeof(U32)); - - return(stream); - } - - void reset(void){ - //this->contigID = 0; - this->fromBlock = -1; - this->fromBlock_entries_offset = 0; - this->toBlock = -1; - this->toBlock_entries_offset = 0; - } - - inline void update(const U32& block, const U32& offset){ - this->toBlock = block; - this->toBlock_entries_offset = offset; - } - -public: - //U32 contigID; // tellg() position in stream for start of record in Tomahawk file - S32 fromBlock; // tellg() position in stream for start of record in Tomahawk file - U32 fromBlock_entries_offset; - S32 toBlock; // number of variants in this block - U32 toBlock_entries_offset; -}; - -} -} - -#endif /* TOTEMPOLEOUTPUTENTRY_H_ */ diff --git a/src/totempole/TotempoleOutputReader.cpp b/src/totempole/TotempoleOutputReader.cpp deleted file mode 100644 index dd38db5..0000000 --- a/src/totempole/TotempoleOutputReader.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#include "TotempoleOutputReader.h" - -namespace Tomahawk { -namespace Totempole { - -TotempoleOutputReader::TotempoleOutputReader() : ERROR_STATE(TOI_INIT), n_entries(0), entries(nullptr), index(nullptr){} -TotempoleOutputReader::~TotempoleOutputReader(){ - delete this->index; -} - -bool TotempoleOutputReader::Open(const std::string& input, const contig_type* contigs){ - this->stream.open(input, std::ios::in | std::ios::binary | std::ios::ate); - if(!this->stream.good()){ - //std::cerr << "failed does not exist" << std::endl; - this->ERROR_STATE = TOI_NO_EXIST; - return false; - } - - U64 filesize = this->stream.tellg(); - this->stream.seekg(0); - - this->stream >> this->header; - if(!this->header.validate(Tomahawk::Constants::WRITE_HEADER_LD_SORT_MAGIC)){ - std::cerr << Helpers::timestamp("ERROR", "TOI") << "Incorrect header!" 
<< std::endl; - this->ERROR_STATE = TOI_CORRUPTED; - exit(1); - } - - this->buffer.resize(filesize); - const U64 readUntil = this->header.n_entries * sizeof(entry_type); - - if(readUntil % sizeof(entry_type) != 0){ - std::cerr << Helpers::timestamp("ERROR", "TOI") << "Mangled data!" << std::endl; - this->ERROR_STATE = TOI_CORRUPTED; - exit(1); - } - - this->stream.read(this->buffer.data, readUntil); - this->entries = reinterpret_cast(this->buffer.data); - this->n_entries = readUntil / sizeof(entry_type); - - if(!(this->header.controller.sorted && this->header.controller.expanded)){ - if(this->stream.tellg() != filesize){ - std::cerr << Helpers::timestamp("ERROR", "TOI") << "Mangled data!" << std::endl; - this->ERROR_STATE = TOI_CORRUPTED; - exit(1); - } - } else { - this->index = new index_type(this->header.n_contig, contigs); - stream >> *this->index; - - if(!stream.good()){ - std::cerr << Helpers::timestamp("ERROR", "TOI") << "Corrupted data!" << std::endl; - exit(1); - } - - if(!stream.tellg() == filesize){ - std::cerr << Helpers::timestamp("ERROR", "TOI") << "Mangled data!" << std::endl; - exit(1); - } - } - - this->ERROR_STATE = TOI_OK; - return true; -} - -bool TotempoleOutputReader::__stateCheck(void) const{ - if(this->ERROR_STATE != TOI_OK){ - std::cerr << Helpers::timestamp("ERROR", "TOI") << "No primary index available..." << std::endl; - return false; - } - - if(!this->header.controller.sorted){ - std::cerr << Helpers::timestamp("ERROR", "TOI") << "Index is not sorted..." << std::endl; - return false; - } - - if(this->index->getState() != TotempoleOutputSortedIndex::TOI_SORTED_ERROR::TOI_SORTED_OK){ - std::cerr << Helpers::timestamp("ERROR", "TOI") << "No sorted index available..." 
<< std::endl; - return false; - } - return true; -} - -bool TotempoleOutputReader::findOverlap(const U32 contigID, totempole_entry& intervals){ - if(!this->__stateCheck()) return false; - return(this->index->findOverlap(contigID, intervals)); -} - -bool TotempoleOutputReader::findOverlap(const U32 contigID, const U32 position, totempole_entry& intervals){ - if(!this->__stateCheck()) return false; - return(this->index->findOverlap(contigID, position, intervals)); -} - -bool TotempoleOutputReader::findOverlap(const U32 contigID, const U32 from, const U32 to, std::vector& intervals){ - if(!this->__stateCheck()) return false; - return(this->index->findOverlap(contigID, from, to, intervals)); -} - - -} /* namespace Totempole */ -} /* namespace Tomahawk */ diff --git a/src/totempole/TotempoleOutputReader.h b/src/totempole/TotempoleOutputReader.h deleted file mode 100644 index 83d9c0f..0000000 --- a/src/totempole/TotempoleOutputReader.h +++ /dev/null @@ -1,63 +0,0 @@ - -#ifndef SRC_TOTEMPOLE_TOTEMPOLEOUTPUTREADER_H_ -#define SRC_TOTEMPOLE_TOTEMPOLEOUTPUTREADER_H_ - -#include "../support/MagicConstants.h" -#include "TotempoleMagic.h" -#include "../io/BasicBuffer.h" -#include "TotempoleOutputSortedIndex.h" - -namespace Tomahawk { -namespace Totempole { - -class TotempoleOutputReader { - typedef TotempoleOutputReader self_type; - typedef Tomahawk::IO::TomahawkOutputSortHeader header_type; - typedef TotempoleOutputEntry entry_type; - typedef IO::BasicBuffer buffer_type; - typedef TotempoleOutputSortedIndex index_type; - typedef Totempole::TotempoleContigBase contig_type; - typedef TotempoleOutputSortedEntry totempole_entry; - -public: - enum TOI_ERROR {TOI_OK, TOI_NO_EXIST, TOI_CORRUPTED, TOI_INIT}; - -public: - TotempoleOutputReader(); - ~TotempoleOutputReader(); - - bool Open(const std::string& input, const contig_type* contigs); - - inline const entry_type& operator[](const U32 p) const{ return(this->entries[p]); } - inline bool getIsSorted(void) const{ 
return(this->header.controller.sorted); } - inline bool getIsSortedExpanded(void) const{ return(this->header.controller.sorted && this->header.controller.expanded); } - inline const U32& size(void) const{ return(this->n_entries); } - - // TOI dispatchers - // Find data blocks mapping to these regions - bool findOverlap(const U32 contigID, totempole_entry& intervals); - bool findOverlap(const U32 contigID, const U32 position, totempole_entry& intervals); - bool findOverlap(const U32 contigID, const U32 from, const U32 to, std::vector& intervals); - - header_type& getHeader(void){ return(this->header); } - const index_type* const getIndex(void) const{ return(this->index); } - -private: - bool __stateCheck(void) const; - -public: - TOI_ERROR ERROR_STATE; - -private: - U32 n_entries; - std::ifstream stream; - header_type header; - buffer_type buffer; - const entry_type* entries; - index_type* index; -}; - -} /* namespace Totempole */ -} /* namespace Tomahawk */ - -#endif /* SRC_TOTEMPOLE_TOTEMPOLEOUTPUTREADER_H_ */ diff --git a/src/totempole/TotempoleOutputSortedIndex.cpp b/src/totempole/TotempoleOutputSortedIndex.cpp deleted file mode 100644 index 44c8cd3..0000000 --- a/src/totempole/TotempoleOutputSortedIndex.cpp +++ /dev/null @@ -1,166 +0,0 @@ -#include "TotempoleOutputSortedIndex.h" - -namespace Tomahawk { -namespace Totempole { - -TotempoleOutputSortedIndex::TotempoleOutputSortedIndex(const U32 n_contigs, const contig_type* const contigs) : - state(TOI_SORTED_INIT), - n_contigs(n_contigs), - prev_contigIDA(n_contigs + 1), // init to impossible id - prev_chunk(0), - contigs(contigs), - linear_index(new chunk_type[n_contigs]), - secondary_index(new totempole_entry[n_contigs]) -{ - for(U32 i = 0; i < this->n_contigs; ++i){ - this->linear_index[i].allocate(TOTEMPOLE_OUTPUT_SORT_CHUNKS); - } -} - -TotempoleOutputSortedIndex::~TotempoleOutputSortedIndex(){ - delete [] this->linear_index; - delete [] this->secondary_index; -} - - -void 
TotempoleOutputSortedIndex::update(const two_entry& entry, const U32& block, const U32& blockOffset){ - const U32 chunk = this->getBin(entry.AcontigID, entry.Aposition); - - // Switch in chromosome - if(entry.AcontigID != this->prev_contigIDA){ - // This is only ever NOT TRUE for the first entry - if(prev_contigIDA != this->n_contigs + 1){ - // Update previous block data with end positions - totempole_entry& temp = this->linear_index[this->prev_contigIDA][this->prev_chunk]; - temp.toBlock = block; - temp.toBlock_entries_offset = blockOffset; - - // Update current block data with start positions - totempole_entry& temp2 = this->linear_index[entry.AcontigID][chunk]; - temp2.fromBlock = block; - temp2.fromBlock_entries_offset = blockOffset; - - // Secondary - this->secondary_index[this->prev_contigIDA].toBlock = block; - this->secondary_index[this->prev_contigIDA].toBlock_entries_offset = blockOffset; - this->secondary_index[entry.AcontigID].fromBlock = block; - this->secondary_index[entry.AcontigID].fromBlock_entries_offset = blockOffset; - } else { - // For the first entry - const U32 chunk = this->getBin(entry.AcontigID, entry.Aposition); - totempole_entry& temp = this->linear_index[entry.AcontigID][chunk]; - temp.fromBlock = block; - temp.fromBlock_entries_offset = blockOffset; - - this->secondary_index[entry.AcontigID].fromBlock = block; - this->secondary_index[entry.AcontigID].fromBlock_entries_offset = blockOffset; - } - } - // Switch in chunk - else if(chunk != this->prev_chunk){ - this->linear_index[this->prev_contigIDA][this->prev_chunk].toBlock = block; - this->linear_index[this->prev_contigIDA][this->prev_chunk].toBlock_entries_offset = blockOffset; - this->linear_index[entry.AcontigID][chunk].fromBlock = block; - this->linear_index[entry.AcontigID][chunk].fromBlock_entries_offset = blockOffset; - } - - // in case there is no switch then last entry is +1 (non-closed interval) - this->linear_index[entry.AcontigID][chunk].update(block, blockOffset + 1); - 
this->secondary_index[entry.AcontigID].update(block, blockOffset + 1); - this->prev_contigIDA = entry.AcontigID; - this->prev_chunk = chunk; -} - - -bool TotempoleOutputSortedIndex::findOverlap(const U32 contigID, totempole_entry& intervals){ - if(this->secondary_index[contigID].fromBlock == -1) - return false; - - intervals = this->secondary_index[contigID]; - - return true; -} - -bool TotempoleOutputSortedIndex::findOverlap(const U32 contigID, const U32 position, totempole_entry& intervals){ - if(this->secondary_index[contigID].fromBlock == -1) - return false; - - const U32 chunk = position / ((this->contigs[contigID].bases >> TOTEMPOLE_OUTPUT_SORT_SHIFT) + 1); - - // Impossible chunk - if(chunk >= TOTEMPOLE_OUTPUT_SORT_CHUNKS) - return false; - - if(this->linear_index[contigID][chunk].fromBlock == -1) - return false; - - intervals = this->linear_index[contigID][chunk]; - return true; -} - -bool TotempoleOutputSortedIndex::findOverlap(const U32 contigID, const U32 from, const U32 to, std::vector& intervals){ - if(this->secondary_index[contigID].fromBlock == -1) - return false; - - const U32 chunkFrom = from / ((this->contigs[contigID].bases >> TOTEMPOLE_OUTPUT_SORT_SHIFT) + 1); - U32 chunkTo = to / ((this->contigs[contigID].bases >> TOTEMPOLE_OUTPUT_SORT_SHIFT) + 1); - - if(chunkFrom >= TOTEMPOLE_OUTPUT_SORT_CHUNKS) - return false; - - if(chunkTo >= TOTEMPOLE_OUTPUT_SORT_CHUNKS) - chunkTo = TOTEMPOLE_OUTPUT_SORT_CHUNKS - 1; - - // Special case when both values fall into the same chunk - if(chunkFrom == chunkTo){ - if(this->linear_index[contigID][chunkFrom].fromBlock == -1) - return false; - - intervals.push_back(this->linear_index[contigID][chunkFrom]); - return true; - } - - totempole_entry entry; - U32 i = chunkFrom; - U32 prevHit = 0; - // Get first - for(; i < chunkTo; ++i){ - if(this->linear_index[contigID][i].fromBlock != -1){ - entry = this->linear_index[contigID][i]; - prevHit = i; - break; - } - } - ++i; - - // Continue until end - for(; i < chunkTo; 
++i){ - if(this->linear_index[contigID][i].fromBlock != -1){ - // extend - if(i - prevHit == 1){ - entry.toBlock = this->linear_index[contigID][i].toBlock; - entry.toBlock_entries_offset = this->linear_index[contigID][i].toBlock_entries_offset; - prevHit = i; - } - } else { - if(entry.fromBlock != -1){ - entry.toBlock = this->linear_index[contigID][i - 1].toBlock; - entry.toBlock_entries_offset = this->linear_index[contigID][i - 1].toBlock_entries_offset; - intervals.push_back(entry); - } - entry.reset(); - } - } - - if(entry.fromBlock != -1){ - entry.toBlock = this->linear_index[contigID][i - 1].toBlock; - entry.toBlock_entries_offset = this->linear_index[contigID][i - 1].toBlock_entries_offset; - intervals.push_back(entry); - } - - return(intervals.size() > 0); -} - - -} /* namespace Totempole */ -} /* namespace Tomahawk */ diff --git a/src/totempole/TotempoleOutputSortedIndex.h b/src/totempole/TotempoleOutputSortedIndex.h deleted file mode 100644 index 41e95ab..0000000 --- a/src/totempole/TotempoleOutputSortedIndex.h +++ /dev/null @@ -1,153 +0,0 @@ -#ifndef TOTEMPOLE_TOTEMPOLEOUTPUTSORTEDINDEX_H_ -#define TOTEMPOLE_TOTEMPOLEOUTPUTSORTEDINDEX_H_ - -#include -#include - -#include "../tomahawk/TomahawkOutput/TomahawkOutputEntry.h" -#include "TotempoleContig.h" -#include "TotempoleOutputEntry.h" - -namespace Tomahawk { -namespace Totempole { - -#define TOTEMPOLE_OUTPUT_SORT_CHUNKS 1024 -#define TOTEMPOLE_OUTPUT_SORT_SHIFT 9 - -struct TotempoleOutputSortedIndexBin{ - typedef TotempoleOutputSortedIndexBin self_type; - typedef TotempoleOutputSortedEntry totempole_entry; - - TotempoleOutputSortedIndexBin(void) : - n_chunks(0), - chunks(nullptr) - {} - - TotempoleOutputSortedIndexBin(const U32 n_chunks) : - n_chunks(n_chunks), - chunks(new totempole_entry[n_chunks]) - {} - - ~TotempoleOutputSortedIndexBin(){ - delete [] this->chunks; - } - - bool allocate(const U32 size){ - if(size == 0) - return false; - - if(this->chunks != nullptr) - delete [] this->chunks; - - 
this->n_chunks = size; - this->chunks = new totempole_entry[size]; - return true; - } - - totempole_entry& operator[](const U32 p){ return(this->chunks[p]); } - friend std::ostream& operator<<(std::ostream& stream, const self_type& self){ - for(U32 i = 0; i < self.n_chunks; ++i) - stream << '\t' << i << '\t' << self.chunks[i] << std::endl; - - return stream; - } - - friend std::ofstream& operator<<(std::ofstream& stream, const self_type& index){ - stream.write(reinterpret_cast(&index.n_chunks), sizeof(U32)); - for(U32 i = 0; i < index.n_chunks; ++i) - stream << index.chunks[i]; - - return stream; - } - - friend std::istream& operator>>(std::istream& stream, self_type& index){ - stream.read(reinterpret_cast(&index.n_chunks), sizeof(U32)); - index.allocate(index.n_chunks); // allocate memory - for(U32 i = 0; i < index.n_chunks; ++i) - stream >> index.chunks[i]; - - return(stream); - } - - U32 n_chunks; - totempole_entry* chunks; -}; - -class TotempoleOutputSortedIndex { - typedef TotempoleOutputSortedIndex self_type; - typedef TotempoleOutputSortedEntry totempole_entry; - typedef IO::TomahawkOutputEntry two_entry; - typedef Totempole::TotempoleContigBase contig_type; - typedef TotempoleOutputSortedIndexBin chunk_type; - -public: - enum TOI_SORTED_ERROR {TOI_SORTED_OK, TOI_SORTED_NO_EXIST, TOI_SORTED_CORRUPTED, TOI_SORTED_INIT}; - -public: - TotempoleOutputSortedIndex(const U32 n_contigs, const contig_type* const contigs); - ~TotempoleOutputSortedIndex(); - - inline chunk_type& operator[](const U32 p){ return(this->linear_index[p]); } - inline U32 getBin(const U32& contigID, const U32& position) const{ return(position / ((this->contigs[contigID].bases >> TOTEMPOLE_OUTPUT_SORT_SHIFT) + 1)); } - inline const TOI_SORTED_ERROR& getState(void) const{ return(this->state); } - - void update(const two_entry& entry, const U32& block, const U32& blockOffset); - - bool findOverlap(const U32 contigID, totempole_entry& intervals); - bool findOverlap(const U32 contigID, const U32 
position, totempole_entry& intervals); - bool findOverlap(const U32 contigID, const U32 from, const U32 to, std::vector& intervals); - - friend std::ostream& operator<<(std::ostream& stream, const self_type& self){ - // linear index - for(U32 i = 0; i < self.n_contigs; ++i) - stream << "contig: " << i << "\n" << self.linear_index[i] << std::endl; - - // secondary index - for(U32 i = 0; i < self.n_contigs; ++i) - stream << self.secondary_index[i] << std::endl; - - return stream; - } - - friend std::ofstream& operator<<(std::ofstream& stream, const self_type& index){ - // secondary - for(U32 i = 0; i < index.n_contigs; ++i) - stream << index.secondary_index[i]; - - // linear - for(U32 i = 0; i < index.n_contigs; ++i) - stream << index.linear_index[i]; - - return stream; - } - - friend std::istream& operator>>(std::istream& stream, self_type& index){ - // secondary - for(U32 i = 0; i < index.n_contigs; ++i) - stream >> index.secondary_index[i]; - - // linear - for(U32 i = 0; i < index.n_contigs; ++i) - stream >> index.linear_index[i]; - - index.state = TOI_SORTED_OK; - - return(stream); - } - -public: - TOI_SORTED_ERROR state; - -private: - const U32 n_contigs; - U32 prev_contigIDA; - U32 prev_chunk; - const contig_type* const contigs; - chunk_type* linear_index; - totempole_entry* secondary_index; -}; - -} /* namespace Totempole */ -} /* namespace Tomahawk */ - -#endif /* TOTEMPOLE_TOTEMPOLEOUTPUTSORTEDINDEX_H_ */ diff --git a/src/totempole/TotempoleReader.cpp b/src/totempole/TotempoleReader.cpp deleted file mode 100644 index f167651..0000000 --- a/src/totempole/TotempoleReader.cpp +++ /dev/null @@ -1,265 +0,0 @@ -#include "TotempoleReader.h" - -namespace Tomahawk { -namespace Totempole { - -TotempoleReader::TotempoleReader() : - filesize(0), - n_contigs(0), - contigs(nullptr), - samples(nullptr), - entries(nullptr), - contigsHashTable(nullptr), - sampleHashTable(nullptr) -{} - -TotempoleReader::~TotempoleReader(){ - delete [] this->contigs; - delete [] 
this->entries; - delete [] this->samples; - - delete this->contigsHashTable; - delete this->sampleHashTable; -} - -bool TotempoleReader::Validate(std::ifstream& in) const{ - char MAGIC[Constants::WRITE_HEADER_MAGIC_INDEX_LENGTH]; - in.read(MAGIC, Constants::WRITE_HEADER_MAGIC_INDEX_LENGTH); - - if(strncmp(MAGIC, Constants::WRITE_HEADER_INDEX_MAGIC, Constants::WRITE_HEADER_MAGIC_INDEX_LENGTH) == 0) - return true; - return false; -} - -bool TotempoleReader::Open(const std::string filename){ - if(filename.size() == 0){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "IO") << "No input filename..." << std::endl; - return false; - } - - this->filename = filename; - - this->stream.open(this->filename, std::ios::in | std::ios::binary | std::ios::ate); - if(!this->stream.good()){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "IO") << "Could not open: " << this->filename << "..." << std::endl; - return false; - } - - this->filesize = this->stream.tellg(); - this->stream.seekg(0); - - if(this->filesize <= 0){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "IO") << "File size is 0..." << std::endl; - return false; - } - - if(this->filesize < Constants::WRITE_HEADER_MAGIC_INDEX_LENGTH){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TOTEMPOLE") << "Failed MAGIC..." << std::endl; - return false; - } - - // Reader header and validate - if(!this->Validate(this->stream)){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TOTEMPOLE") << "Could not validate Totempole header..." 
<< std::endl; - return false; - } - - // Load header data - this->stream >> this->header; -#if DEBUG_MODE == 1 - std::cerr << this->header << std::endl; -#endif - - // Get number of contigs - this->stream.read(reinterpret_cast(&this->n_contigs), sizeof(U32)); -#if DEBUG_MODE == 1 - std::cerr << this->n_contigs << std::endl; -#endif - - this->contigs = new contig_type[this->size()]; - for(U32 i = 0; i < this->size(); ++i){ - contig_base_type* contig_base = reinterpret_cast(&this->contigs[i]); - this->stream >> *contig_base; -#if DEBUG_MODE == 1 - std::cerr << *contig_base << std::endl; -#endif - } - - buffer_type buffer(65536); - this->samples = new std::string[this->getSamples()]; - for(U32 i = 0; i < this->getSamples(); ++i){ - this->stream.read(&buffer.data[0], sizeof(U32)); - const U32 length = *reinterpret_cast(&buffer.data[0]); - this->stream.read(&buffer.data[sizeof(U32)], length); - this->samples[i] = std::string(&buffer.data[sizeof(U32)], length); -#if DEBUG_MODE == 1 - std::cerr << i << '\t' << samples[i] << std::endl; -#endif - } - - // Parse literal block - if(!this->tgzf_controller.InflateBlock(this->stream, buffer)){ - std::cerr << Helpers::timestamp("ERROR", "TGZF") << "Failed to get deflate literal TGZF DATA!" 
<< std::endl; - return false; - } - - this->literals = std::string(&this->tgzf_controller.buffer.data[0], this->tgzf_controller.buffer.size()); -#if DEBUG_MODE == 1 - std::cerr << this->literals << std::endl; -#endif - // end parse literals - - if(this->stream.tellg() != this->header.offset){ - std::cerr << Helpers::timestamp("ERROR", "TOTEMPOLE") << "Corrupt file" << std::endl; - std::cerr << Helpers::timestamp("ERROR", "TOTEMPOLE") << this->stream.tellg() << '/' << this->header.offset << std::endl; - return false; - } - - // Populate Totempole entries - this->entries = new entry_type[this->getBlocks()]; - for(U32 i = 0; i < this->getBlocks(); ++i){ - this->stream >> this->entries[i]; -#if DEBUG_MODE == 1 - std::cerr << i << '\t' << this->header.blocks << '\t' << this->entries[i] << std::endl; -#endif - } - this->BuildUpdateContigs(); - -#if DEBUG_MODE == 1 - for(U32 i = 0; i < this->size(); ++i) - std::cerr << this->contigs[i] << std::endl; -#endif - - if(!this->ValidateEOF(this->stream)) - return false; - - if(!SILENT){ - std::cerr << Helpers::timestamp("LOG", "TOTEMPOLE") << "Found: " << Helpers::NumberThousandsSeparator(std::to_string(this->getBlocks())) << " blocks..." << std::endl; - std::cerr << Helpers::timestamp("LOG", "TOTEMPOLE") << "Found: " << Helpers::NumberThousandsSeparator(std::to_string(this->size())) << " contigs and " << Helpers::NumberThousandsSeparator(std::to_string(this->getSamples())) << " samples..." << std::endl; - } - - U64 totalEntries = 0; - for(U32 i = 0; i < this->getBlocks(); ++i) - totalEntries += this->entries[i].variants; - - if(!SILENT) - std::cerr << Helpers::timestamp("LOG", "TOTEMPOLE") << "Found: " << Helpers::NumberThousandsSeparator(std::to_string(totalEntries)) << " variants..." << std::endl; - - // Parse - if(!this->BuildHashTables()){ - std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TOTEMPOLE") << "Could not parse Totempole..." 
<< std::endl; - return false; - } - - // Cleanup - buffer.deleteAll(); - - return true; -} - -bool TotempoleReader::ValidateEOF(std::ifstream& in){ - char temp_buffer[Constants::eof_length*sizeof(U64)]; - in.read(&temp_buffer[0], Constants::eof_length*sizeof(U64)); - for(U32 i = 0; i < Constants::eof_length; ++i){ - const U64* eof = reinterpret_cast(&temp_buffer[sizeof(U64)*i]); - - if(*eof != Constants::eof[i]){ - std::cerr << Helpers::timestamp("ERROR", "TOTEMPOLE") << "Truncated index file!" << std::endl; - return false; - } - } - return true; -} - -void TotempoleReader::BuildUpdateContigs(void){ - // Find boundaries for Totempole blocks - // Master index of indices - // Update contig data - U32 lastContigID = this->entries[0].contigID; - this->contigs[lastContigID].minPosition = this->entries[0].minPosition; - this->contigs[lastContigID].blocksStart = 0; - for(U32 i = 1; i < this->getBlocks(); ++i){ - if(lastContigID != this->entries[i].contigID){ - this->contigs[lastContigID].maxPosition = this->entries[i-1].maxPosition; - this->contigs[lastContigID].blocksEnd = i; - this->contigs[this->entries[i].contigID].minPosition = this->entries[i].minPosition; - this->contigs[this->entries[i].contigID].blocksStart = i; - } - lastContigID = this->entries[i].contigID; - } - const TotempoleEntry& lastEntry = this->entries[this->getBlocks() - 1]; - this->contigs[lastEntry.contigID].blocksEnd = this->getBlocks(); - this->contigs[lastEntry.contigID].maxPosition = lastEntry.maxPosition; -} - -bool TotempoleReader::BuildHashTables(void){ - if(this->size() < 1024) - this->contigsHashTable = new hash_table(1024); - else - this->contigsHashTable = new hash_table(this->size() * 2); - - S32* retValue = 0; - for(U32 i = 0; i < this->size(); ++i){ - if(this->contigsHashTable->GetItem(&this->contigs[i].name[0], &this->contigs[i].name, retValue, this->contigs[i].name.size())){ - std::cerr << Helpers::timestamp("ERROR", "TOTEMPOLE") << "Duplicated contig! Impossible!" 
<< std::endl; - return false; - } - this->contigsHashTable->SetItem(&this->contigs[i].name[0], &this->contigs[i].name, i, this->contigs[i].name.size()); - } - - if(this->getSamples() < 1024) - this->sampleHashTable = new hash_table(1024); - else - this->sampleHashTable = new hash_table(this->getSamples() * 2); - - retValue = 0; - for(U32 i = 0; i < this->getSamples(); ++i){ - if(this->sampleHashTable->GetItem(&this->samples[i][0], &this->samples[i], retValue, this->samples[i].size())){ - std::cerr << Helpers::timestamp("ERROR", "TOTEMPOLE") << "Duplicated name! Impossible!" << std::endl; - return false; - } - this->sampleHashTable->SetItem(&this->samples[i][0], &this->samples[i], i, this->samples[i].size()); - } - - return true; -} - -// Find overlaps function using Totempole data -std::vector TotempoleReader::findOverlaps(const Interval& interval) const{ - std::vector ret; - for(U32 i = this->contigs[interval.contigID].blocksStart; i < this->contigs[interval.contigID].blocksEnd; ++i){ - const TotempoleEntry& current = (*this)[i]; - if((interval.from >= current.minPosition && interval.from <= current.maxPosition) || - (interval.to >= current.minPosition && interval.to <= current.maxPosition) || - (interval.from <= current.minPosition && interval.to >= current.maxPosition)) - ret.push_back(i); - - // No need to continue searching as file is ordered - - if(current.minPosition > interval.to){ - std::cerr << "break: " << current.minPosition << ">" << interval.to << std::endl; - break; - } - - - } - - return ret; -} - -bool TotempoleReader::writeLiterals(std::ofstream& stream){ - // Parse literal block - buffer_type buffer(&this->literals[0], this->literals.size()); - - this->tgzf_controller.Clear(); - if(!this->tgzf_controller.Deflate(buffer)){ - std::cerr << Helpers::timestamp("ERROR", "TGZF") << "Failed to get deflate literal TGZF DATA!" 
<< std::endl; - return false; - } - stream.write(&this->tgzf_controller.buffer.data[0], this->tgzf_controller.buffer.pointer); - - return true; -} - -} -} /* namespace Tomahawk */ diff --git a/src/totempole/TotempoleReader.h b/src/totempole/TotempoleReader.h deleted file mode 100644 index 8962b38..0000000 --- a/src/totempole/TotempoleReader.h +++ /dev/null @@ -1,96 +0,0 @@ -#ifndef TOMAHAWK_TOTEMPOLEREADER_H_ -#define TOMAHAWK_TOTEMPOLEREADER_H_ - -#include -#include -#include -#include -#include - -#include "../support/MagicConstants.h" -#include "../support/helpers.h" -#include "../algorithm/OpenHashTable.h" -#include "../io/compression/TGZFController.h" -#include "TotempoleEntry.h" -#include "TotempoleContig.h" -#include "TotempoleHeader.h" - -namespace Tomahawk { -namespace Totempole{ - -// Todo: temp interval -struct Interval{ - Interval(U32 contig, U32 from, U32 to) : contigID(contig), from(from), to(to){} - ~Interval(){} - - U32 contigID; - U32 from; - U32 to; -}; - -class TotempoleReader { - typedef TotempoleReader self_type; - typedef TotempoleHeader header_type; - typedef TotempoleContigBase contig_base_type; - typedef TotempoleContig contig_type; - typedef TotempoleEntry entry_type; - typedef Tomahawk::Hash::HashTable hash_table; - typedef IO::TGZFHeader tgzf_type; - typedef IO::TGZFController tgzf_controller_type; - typedef IO::BasicBuffer buffer_type; - -public: - TotempoleReader(); - ~TotempoleReader(); - - bool Open(const std::string filename); - std::vector findOverlaps(const Interval& interval) const; - const entry_type& front(void) const{ return(this->entries[0]); } - const entry_type& back(void) const{ return(this->entries[this->getBlocks() - 1]); }; - - inline const U32& getLargestBlockSize(void) const{ return this->header.largest_uncompressed; } - inline const U32& getBlocks(void) const{ return this->header.blocks; } - inline const U64& getSamples(void) const{ return this->header.samples; } - inline const U32& getContigs(void) const{ return 
this->n_contigs; } - inline const U32& size(void) const{ return this->n_contigs; } - inline const entry_type& operator[](const U32 p) const{ return this->entries[p]; } - inline const contig_type& getContig(const U32 contigID) const{ return this->contigs[contigID]; } - inline bool getContig(const std::string& string, S32*& contigID) const{ - if(this->contigsHashTable->GetItem(&string[0], &string, contigID, string.size())) - return true; - - return false; - } - inline const header_type& getHeader(void) const{ return(this->header); } - inline const contig_base_type* getContigBase(const U32 contigID) const{ return(reinterpret_cast(&this->contigs[contigID])); } - inline void addLiteral(const std::string& string){ this->literals += string; } - - hash_table* getContigHTablePointer(void) const{ return(this->contigsHashTable); } - hash_table* getSampleHTablePointer(void) const{ return(this->sampleHashTable); } - bool writeLiterals(std::ofstream& stream); - -private: - bool Validate(std::ifstream& in) const; - void BuildUpdateContigs(void); - bool ValidateEOF(std::ifstream& in); - bool BuildHashTables(void); - -public: - std::ifstream stream; // filestream - std::string filename; // filename - U32 filesize; // filesize - U32 n_contigs; // number of contigs - header_type header; // header information - std::string literals; // literal data - tgzf_controller_type tgzf_controller; // tgzf controller - contig_type* contigs; // contig data - std::string* samples; // sample names - entry_type* entries; // totempole entries data - hash_table* contigsHashTable; // contig name hash table - hash_table* sampleHashTable; // sample name hash table -}; - -} -} /* namespace Tomahawk */ - -#endif /* TOMAHAWK_TOTEMPOLEREADER_H_ */ diff --git a/src/utility.h b/src/utility.h index e28b90b..0c667fd 100644 --- a/src/utility.h +++ b/src/utility.h @@ -10,8 +10,8 @@ std::string Tomahawk::Constants::INTERPRETED_COMMAND; void programMessage(const bool separator = true){ std::cerr << "Program: " << 
Tomahawk::Constants::PROGRAM_NAME << " " << VERSION << std::endl; - std::cerr << "Contact: Marcus D. R. Klarqvist " << std::endl; - std::cerr << "Documentation: https://github.com/mklarqvist/Tomahawk" << std::endl; + std::cerr << "Contact: Marcus D. R. Klarqvist " << std::endl; + std::cerr << "Documentation: https://github.com/mklarqvist/tomahawk" << std::endl; std::cerr << "License: MIT" << std::endl; if(separator) std::cerr << "----------" << std::endl; } diff --git a/src/view.h b/src/view.h index e50238d..72e83e4 100644 --- a/src/view.h +++ b/src/view.h @@ -22,11 +22,10 @@ DEALINGS IN THE SOFTWARE. */ #include +#include "tomahawk/two/output_filter.h" +#include "tomahawk/two/TomahawkOutputReader.h" #include "utility.h" -#include "totempole/TotempoleReader.h" #include "tomahawk/TomahawkReader.h" -#include "tomahawk/TomahawkOutput/TomahawkOutputFilterController.h" -#include "tomahawk/TomahawkOutput/TomahawkOutputReader.h" void view_usage(void){ programMessage(); @@ -37,7 +36,7 @@ void view_usage(void){ "Usage: " << Tomahawk::Constants::PROGRAM_NAME << " view [options] -i \n\n" "Options:\n" " -i FILE input Tomahawk (required)\n" - " -o FILE output file (- for stdout)\n" + " -o FILE output file (- for stdout; default: -)\n" " -h/H (twk/two) header only / no header [null]\n" " -O char output type: b for TWO format, n for tab-delimited format (default: b)\n" " -N output in tab-delimited text format (see -O) [null]\n" @@ -66,6 +65,7 @@ void view_usage(void){ " -M, --maxMP FLOAT largest model Chi-squared CV (default: inf)\n" " -f INT include FLAG value\n" " -F INT exclude FLAG value\n" + " -u output only the upper triangular values\n" " --min FLOAT smallest cell count (default: 0)\n" " --max FLOAT largest cell count (default: inf)\n"; } @@ -77,32 +77,32 @@ int view(int argc, char** argv){ } static struct option long_options[] = { - {"input", required_argument, 0, 'i' }, - {"output", optional_argument, 0, 'o' }, - {"minP", optional_argument, 0, 'p' }, - {"maxP", 
optional_argument, 0, 'P' }, - {"minR2", optional_argument, 0, 'r' }, - {"maxR2", optional_argument, 0, 'R' }, - {"minDP", optional_argument, 0, 'd' }, - {"maxDP", optional_argument, 0, 'D' }, - {"minChi", optional_argument, 0, 'x' }, - {"maxChi", optional_argument, 0, 'X' }, - {"minAlelles", optional_argument, 0, 'a' }, - {"maxAlleles", optional_argument, 0, 'A' }, - {"minMP", optional_argument, 0, 'm' }, - {"maxMP", optional_argument, 0, 'M' }, - {"flagInclude", optional_argument, 0, 'f' }, - {"flagExclude", optional_argument, 0, 'F' }, - {"headerOnly", no_argument, 0, 'H' }, - {"noHeader", no_argument, 0, 'h' }, - {"dropGenotypes", optional_argument, 0, 'G' }, - {"silent", no_argument, 0, 's' }, + {"input", required_argument, 0, 'i' }, + {"output", optional_argument, 0, 'o' }, + {"minP", optional_argument, 0, 'p' }, + {"maxP", optional_argument, 0, 'P' }, + {"minR2", optional_argument, 0, 'r' }, + {"maxR2", optional_argument, 0, 'R' }, + {"minDP", optional_argument, 0, 'd' }, + {"maxDP", optional_argument, 0, 'D' }, + {"minChi", optional_argument, 0, 'x' }, + {"maxChi", optional_argument, 0, 'X' }, + {"minAlelles", optional_argument, 0, 'a' }, + {"maxAlleles", optional_argument, 0, 'A' }, + {"minMP", optional_argument, 0, 'm' }, + {"maxMP", optional_argument, 0, 'M' }, + {"flagInclude", optional_argument, 0, 'f' }, + {"flagExclude", optional_argument, 0, 'F' }, + {"headerOnly", no_argument, 0, 'H' }, + {"noHeader", no_argument, 0, 'h' }, + {"dropGenotypes",optional_argument, 0, 'G' }, + {"silent", no_argument, 0, 's' }, {0,0,0,0} }; // Parameter defaults std::string input, output; - Tomahawk::TomahawkOutputFilterController two_filter; + Tomahawk::OutputFilter two_filter; bool outputHeader = true; int outputType = 1; bool dropGenotypes = false; @@ -329,8 +329,8 @@ int view(int argc, char** argv){ Tomahawk::TomahawkReader tomahawk; tomahawk.setDropGenotypes(dropGenotypes); tomahawk.setShowHeader(outputHeader); - if(!tomahawk.Open(input)){ - std::cerr << 
Tomahawk::Helpers::timestamp("ERROR") << "Failed build!" << std::endl; + if(!tomahawk.open(input)){ + std::cerr << Tomahawk::Helpers::timestamp("ERROR") << "Failed to open!" << std::endl; return 1; } @@ -338,25 +338,27 @@ int view(int argc, char** argv){ tomahawk.outputBlocks(); } else if(end == Tomahawk::Constants::OUTPUT_LD_SUFFIX){ - Tomahawk::IO::TomahawkOutputReader reader; + Tomahawk::TomahawkOutputReader reader; reader.setWriteHeader(outputHeader); - Tomahawk::TomahawkOutputFilterController& filter = reader.getFilter(); - filter = Tomahawk::TomahawkOutputFilterController(two_filter); // use copy ctor to transfer data + Tomahawk::OutputFilter& filter = reader.getFilter(); + filter = Tomahawk::OutputFilter(two_filter); // use copy ctor to transfer data - if(!reader.setWriterType(outputType)) - return 1; - if(!reader.Open(input)) + //if(!reader.setWriterType(outputType)) + // return 1; + + if(!reader.open(input)) return 1; - if(!reader.AddRegions(filter_regions)){ + if(!reader.addRegions(filter_regions)){ std::cerr << Tomahawk::Helpers::timestamp("ERROR") << "Failed to add region!" 
<< std::endl; return 1; } - if(!reader.view(input)) + if(!reader.view()) return 1; + } else { std::cerr << Tomahawk::Helpers::timestamp("ERROR") << "Unrecognised input file format: " << input << std::endl; return 1; diff --git a/test_files/import/bcf/1kgp3_chr20_subset_20k.bcf b/test_files/import/bcf/1kgp3_chr20_subset_20k.bcf deleted file mode 100644 index 9fbc71a..0000000 Binary files a/test_files/import/bcf/1kgp3_chr20_subset_20k.bcf and /dev/null differ diff --git a/test_files/import/bcf/1kgp3_chrY_10k.bcf b/test_files/import/bcf/1kgp3_chrY_10k.bcf deleted file mode 100644 index e77acd2..0000000 Binary files a/test_files/import/bcf/1kgp3_chrY_10k.bcf and /dev/null differ diff --git a/test_files/view/1kgp3_chr20_subset.twk b/test_files/view/1kgp3_chr20_subset.twk new file mode 100644 index 0000000..dca5d4d Binary files /dev/null and b/test_files/view/1kgp3_chr20_subset.twk differ diff --git a/test_files/view/1kgp3_chr20_subset.two b/test_files/view/1kgp3_chr20_subset.two new file mode 100644 index 0000000..781324f Binary files /dev/null and b/test_files/view/1kgp3_chr20_subset.two differ diff --git a/test_files/view/1kgp3_chr20_subset_1000lines.ld b/test_files/view/1kgp3_chr20_subset_1000lines.ld new file mode 100644 index 0000000..63c5a75 --- /dev/null +++ b/test_files/view/1kgp3_chr20_subset_1000lines.ld @@ -0,0 +1,1000 @@ +##fileformat=VCFv4.1 +##FILTER= +##fileDate=20150218 +##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz +##source=1000GenomesPhase3Pipeline +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##bcftools_viewVersion=1.6+htslib-1.6 +##bcftools_viewCommand=view -O b -o 1kgp3_chr20.bcf 
ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz; Date=Thu Feb 1 15:40:09 2018 +##bcftools_viewCommand=view 1kgp3_chr20.bcf; Date=Thu Feb 15 12:15:30 2018 +##bcftools_viewCommand=view -O b -o 1kgp3_chr20_subset.bcf; Date=Thu Feb 15 12:15:30 2018 +##tomahawk_importCommand=tomahawk import -i 1kgp3_chr20_subset.bcf -o 1kgp3_chr20_subset; VERSION=; Date=2018-02-15 12:16:31,154; SIMD=AVX2-256 +##tomahawk_calcCommand=tomahawk calc -pi 1kgp3_chr20_subset.twk -o 1kgp3_chr20_subset; VERSION=; Date=2018-02-15 12:17:31,661; SIMD=AVX2-256 +##tomahawk_calcInterpretedCommand=minR2=0.099000 maxR2=1.001000 minP=0.000100 minMHF=5 maxMHF=9223372036854775807 partStart=0 parts=1 threads=8 compression=1 force_type=1##tomahawk_calcCommand=tomahawk calc -pi 1kgp3_chr20_subset.twk -o 1kgp3_chr20_subset; VERSION=; Date=2018-02-15 12:17:31,661; SIMD=AVX2-256 +##tomahawk_viewCommand=tomahawk view -i 1kgp3_chr20_subset.two; VERSION=; Date=2018-02-15 12:18:18,712; SIMD=AVX2-256 +##tomahawk_viewFilters=no_filter filter=NO regions=NO +17 20 100495 20 140467 4561 8 379 60 0.010790562 0.87104905 0.32969466 0.10869858 3.0379365e-56 544.26934 0 +17 20 140467 20 100495 4561 8 379 60 0.010790562 0.87104905 0.32969466 0.10869858 3.0379365e-56 544.26934 0 +17 20 100495 20 141656 4537 32 342 97 0.017111002 0.72810364 0.3819451 0.14588206 1.4769359e-78 730.52764 0 +17 20 141656 20 100495 4537 32 342 97 0.017111002 0.72810364 0.3819451 0.14588206 1.4769359e-78 730.52764 0 +17 20 100495 20 144279 4546 23 349 90 0.015993301 0.77690363 0.38080999 0.14501625 9.4276146e-77 726.18475 0 +17 20 144279 20 100495 4546 23 349 90 0.015993301 0.77690363 0.38080999 0.14501625 9.4276146e-77 726.18475 0 +17 20 100495 20 144551 4481 88 287 152 0.026150491 0.59810317 0.4329043 0.18740612 4.1880188e-206 938.50261 0 +17 20 144551 20 100495 4481 88 287 152 0.026150491 0.59810317 0.4329043 0.18740612 4.1880188e-206 938.50261 0 +17 20 100495 20 146274 4561 8 379 60 0.010790562 0.87104905 0.32969466 
0.10869858 3.0379365e-56 544.26934 0 +17 20 146274 20 100495 4561 8 379 60 0.010790562 0.87104905 0.32969466 0.10869858 3.0379365e-56 544.26934 0 +17 20 100495 20 149381 4551 18 346 93 0.016627351 0.82225692 0.39937633 0.15950145 1.9940477e-83 798.72558 0 +17 20 149381 20 100495 4551 18 346 93 0.016627351 0.82225692 0.39937633 0.15950145 1.9940477e-83 798.72558 0 +17 20 100495 20 149599 4552 17 346 93 0.016644854 0.83060551 0.40156889 0.16125758 3.4765175e-84 807.5198 0 +17 20 149599 20 100495 4552 17 346 93 0.016644854 0.83060551 0.40156889 0.16125758 3.4765175e-84 807.5198 0 +17 20 100495 20 151677 4552 17 346 93 0.016644854 0.83060551 0.40156889 0.16125758 3.4765175e-84 807.5198 0 +17 20 151677 20 100495 4552 17 346 93 0.016644854 0.83060551 0.40156889 0.16125758 3.4765175e-84 807.5198 0 +17 20 100495 20 153571 4560 9 379 60 0.010773058 0.85703266 0.32679901 0.1067976 2.1533657e-55 534.75056 0 +17 20 153571 20 100495 4560 9 379 60 0.010773058 0.85703266 0.32679901 0.1067976 2.1533657e-55 534.75056 0 +17 20 100495 20 153572 4560 9 379 60 0.010773058 0.85703266 0.32679901 0.1067976 2.1533657e-55 534.75056 0 +17 20 153572 20 100495 4560 9 379 60 0.010773058 0.85703266 0.32679901 0.1067976 2.1533657e-55 534.75056 0 +17 20 100495 20 155621 4486 83 275 164 0.028424133 0.63168091 0.46416828 0.21545218 1.2369172e-236 1078.9577 0 +17 20 155621 20 100495 4486 83 275 164 0.028424133 0.63168091 0.46416828 0.21545218 1.2369172e-236 1078.9577 0 +17 20 100495 20 156111 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 156111 20 100495 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 100495 20 156142 4159 410 127 312 0.049662471 0.661973 0.4999418 0.2499418 3.5478779e-274 1251.6985 0 +17 20 156142 20 100495 4159 410 127 312 0.049662471 0.661973 0.4999418 0.2499418 3.5478779e-274 1251.6985 0 +17 20 100495 20 156299 4534 35 323 116 0.020519843 0.74594116 0.42431396 0.18004234 1.0918409e-96 
901.60937 0 +17 20 156299 20 100495 4534 35 323 116 0.020519843 0.74594116 0.42431396 0.18004234 1.0918409e-96 901.60937 0 +17 20 100495 20 156300 4534 35 323 116 0.020519843 0.74594116 0.42431396 0.18004234 1.0918409e-96 901.60937 0 +17 20 156300 20 100495 4534 35 323 116 0.020519843 0.74594116 0.42431396 0.18004234 1.0918409e-96 901.60937 0 +17 20 100495 20 156454 4525 44 352 87 0.015079188 0.63185018 0.33408096 0.11161009 1.4151123e-62 558.89433 0 +17 20 156454 20 100495 4525 44 352 87 0.015079188 0.63185018 0.33408096 0.11161009 1.4151123e-62 558.89433 0 +17 20 100495 20 156532 4552 17 353 86 0.015369617 0.81909317 0.38292235 0.14662954 1.0958227e-76 734.25882 0 +17 20 156532 20 100495 4552 17 353 86 0.015369617 0.81909317 0.38292235 0.14662954 1.0958227e-76 734.25882 0 +17 20 100495 20 156755 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 156755 20 100495 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 100495 20 156966 4319 250 276 163 0.025318796 0.33651194 0.32547009 0.10593078 2.2155723e-117 530.48476 0 +17 20 156966 20 100495 4319 250 276 163 0.025318796 0.33651194 0.32547009 0.10593078 2.2155723e-117 530.48476 0 +17 20 100495 20 157523 4559 10 380 59 0.010573379 0.84114748 0.32074177 0.10287529 1.5529809e-53 515.10751 0 +17 20 157523 20 100495 4559 10 380 59 0.010573379 0.84114748 0.32074177 0.10287529 1.5529809e-53 515.10751 0 +17 20 100495 20 158264 4495 74 318 121 0.020748073 0.58405083 0.37926111 0.143839 1.1448139e-158 720.31237 0 +17 20 158264 20 100495 4495 74 318 121 0.020748073 0.58405083 0.37926111 0.143839 1.1448139e-158 720.31237 0 +17 20 100495 20 158498 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 158498 20 100495 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 100495 20 158746 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 
20 158746 20 100495 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 100495 20 158984 4523 46 353 86 0.014862004 0.61803192 0.32805324 0.10761894 9.3582997e-61 538.90699 0 +17 20 158984 20 100495 4523 46 353 86 0.014862004 0.61803192 0.32805324 0.10761894 9.3582997e-61 538.90699 0 +17 20 100495 20 159524 4524 45 353 86 0.014879507 0.62348312 0.32965702 0.10867374 3.5123827e-61 544.18915 0 +17 20 159524 20 100495 4524 45 353 86 0.014879507 0.62348312 0.32965702 0.10867374 3.5123827e-61 544.18915 0 +145 20 100495 20 159745 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +145 20 159745 20 100495 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +17 20 100495 20 159819 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 159819 20 100495 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +145 20 100495 20 160216 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +145 20 160216 20 100495 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +17 20 100495 20 161586 4524 45 353 86 0.014879507 0.62348312 0.32965702 0.10867374 3.5123827e-61 544.18915 0 +17 20 161586 20 100495 4524 45 353 86 0.014879507 0.62348312 0.32965702 0.10867374 3.5123827e-61 544.18915 0 +145 20 100495 20 162102 4101 468 155 284 0.0435463 0.58453935 0.43104967 0.18580382 2.3041899e-204 930.49582 0 +145 20 162102 20 100495 4101 468 155 284 0.0435463 0.58453935 0.43104967 0.18580382 2.3041899e-204 930.49582 0 +17 20 100495 20 162291 4524 45 353 86 0.014879507 0.62348312 0.32965702 0.10867374 3.5123827e-61 544.18915 0 +17 20 162291 20 100495 4524 45 353 86 0.014879507 0.62348312 0.32965702 0.10867374 3.5123827e-61 544.18915 0 +145 20 100495 20 162292 4136 433 231 208 0.030313518 0.39656761 0.32084963 0.10294448 3.962886e-114 515.53481 0 
+145 20 162292 20 100495 4136 433 231 208 0.030313518 0.39656761 0.32084963 0.10294448 3.962886e-114 515.53481 0 +17 20 100495 20 162684 4554 15 364 75 0.013400682 0.81731957 0.35669512 0.12723142 1.6005073e-66 637.10435 0 +17 20 162684 20 100495 4554 15 364 75 0.013400682 0.81731957 0.35669512 0.12723142 1.6005073e-66 637.10435 0 +17 20 100495 20 162810 4551 18 349 90 0.016080821 0.81731963 0.39145702 0.15323861 3.2565827e-80 767.35972 0 +17 20 162810 20 100495 4551 18 349 90 0.016080821 0.81731963 0.39145702 0.15323861 3.2565827e-80 767.35972 0 +17 20 100495 20 163483 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 163483 20 100495 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 100495 20 164595 4554 15 364 75 0.013400682 0.81731957 0.35669512 0.12723142 1.6005073e-66 637.10435 0 +17 20 164595 20 100495 4554 15 364 75 0.013400682 0.81731957 0.35669512 0.12723142 1.6005073e-66 637.10435 0 +145 20 100495 20 165124 4116 453 230 209 0.030145615 0.39627671 0.3147285 0.099054039 6.8710753e-110 496.0518 0 +145 20 165124 20 100495 4116 453 230 209 0.030145615 0.39627671 0.3147285 0.099054039 6.8710753e-110 496.0518 0 +17 20 100495 20 165125 4552 17 362 77 0.013730028 0.80177224 0.35774675 0.12798274 3.4560954e-67 640.86991 0 +17 20 165125 20 100495 4552 17 362 77 0.013730028 0.80177224 0.35774675 0.12798274 3.4560954e-67 640.86991 0 +17 20 100495 20 165433 4524 45 352 87 0.015061685 0.62633562 0.33246088 0.11053023 3.854968e-62 553.4867 0 +17 20 165433 20 100495 4524 45 352 87 0.015061685 0.62633562 0.33246088 0.11053023 3.854968e-62 553.4867 0 +145 20 100495 20 165479 4105 464 155 284 0.043616317 0.58492947 0.43269232 0.18722264 6.5760166e-206 937.60117 0 +145 20 165479 20 100495 4105 464 155 284 0.043616317 0.58492947 0.43269232 0.18722264 6.5760166e-206 937.60117 0 +17 20 100495 20 165704 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 
165704 20 100495 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +145 20 100495 20 165802 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 165802 20 100495 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 100495 20 166163 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 166163 20 100495 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 100495 20 166188 4105 464 155 284 0.043616317 0.58492947 0.43269232 0.18722264 6.5760166e-206 937.60117 0 +145 20 166188 20 100495 4105 464 155 284 0.043616317 0.58492947 0.43269232 0.18722264 6.5760166e-206 937.60117 0 +145 20 100495 20 167218 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 167218 20 100495 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +17 20 100495 20 167301 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 167301 20 100495 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +145 20 100495 20 167325 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 167325 20 100495 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 100495 20 167441 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 167441 20 100495 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 100495 20 167455 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 167455 20 100495 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +17 20 100495 20 167599 4565 4 376 63 0.011407108 0.93456227 0.35108843 0.1232631 1.9270085e-63 
617.20699 0 +17 20 167599 20 100495 4565 4 376 63 0.011407108 0.93456227 0.35108843 0.1232631 1.9270085e-63 617.20699 0 +17 20 100495 20 167643 4551 18 348 91 0.016262997 0.81899559 0.39411175 0.15532409 2.7776987e-81 777.80441 0 +17 20 167643 20 100495 4551 18 348 91 0.016262997 0.81899559 0.39411175 0.15532409 2.7776987e-81 777.80441 0 +145 20 100495 20 167647 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 167647 20 100495 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 100495 20 167774 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +145 20 167774 20 100495 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +145 20 100495 20 167900 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +145 20 167900 20 100495 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +145 20 100495 20 168020 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 168020 20 100495 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +17 20 100495 20 169131 4524 45 352 87 0.015061685 0.62633562 0.33246088 0.11053023 3.854968e-62 553.4867 0 +17 20 169131 20 100495 4524 45 352 87 0.015061685 0.62633562 0.33246088 0.11053023 3.854968e-62 553.4867 0 +17 20 100495 20 170114 4524 45 352 87 0.015061685 0.62633562 0.33246088 0.11053023 3.854968e-62 553.4867 0 +17 20 170114 20 100495 4524 45 352 87 0.015061685 0.62633562 0.33246088 0.11053023 3.854968e-62 553.4867 0 +17 20 100495 20 170805 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 170805 20 100495 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 100505 20 140467 4561 8 379 60 0.010790562 0.87104905 0.32969466 0.10869858 3.0379365e-56 
544.26934 0 +17 20 140467 20 100505 4561 8 379 60 0.010790562 0.87104905 0.32969466 0.10869858 3.0379365e-56 544.26934 0 +17 20 100505 20 141656 4537 32 342 97 0.017111002 0.72810364 0.3819451 0.14588206 1.4769359e-78 730.52764 0 +17 20 141656 20 100505 4537 32 342 97 0.017111002 0.72810364 0.3819451 0.14588206 1.4769359e-78 730.52764 0 +17 20 100505 20 144279 4546 23 349 90 0.015993301 0.77690363 0.38080999 0.14501625 9.4276146e-77 726.18475 0 +17 20 144279 20 100505 4546 23 349 90 0.015993301 0.77690363 0.38080999 0.14501625 9.4276146e-77 726.18475 0 +17 20 100505 20 144551 4481 88 287 152 0.026150491 0.59810317 0.4329043 0.18740612 4.1880188e-206 938.50261 0 +17 20 144551 20 100505 4481 88 287 152 0.026150491 0.59810317 0.4329043 0.18740612 4.1880188e-206 938.50261 0 +17 20 100505 20 146274 4561 8 379 60 0.010790562 0.87104905 0.32969466 0.10869858 3.0379365e-56 544.26934 0 +17 20 146274 20 100505 4561 8 379 60 0.010790562 0.87104905 0.32969466 0.10869858 3.0379365e-56 544.26934 0 +17 20 100505 20 149381 4551 18 346 93 0.016627351 0.82225692 0.39937633 0.15950145 1.9940477e-83 798.72558 0 +17 20 149381 20 100505 4551 18 346 93 0.016627351 0.82225692 0.39937633 0.15950145 1.9940477e-83 798.72558 0 +17 20 100505 20 149599 4552 17 346 93 0.016644854 0.83060551 0.40156889 0.16125758 3.4765175e-84 807.5198 0 +17 20 149599 20 100505 4552 17 346 93 0.016644854 0.83060551 0.40156889 0.16125758 3.4765175e-84 807.5198 0 +17 20 100505 20 151677 4552 17 346 93 0.016644854 0.83060551 0.40156889 0.16125758 3.4765175e-84 807.5198 0 +17 20 151677 20 100505 4552 17 346 93 0.016644854 0.83060551 0.40156889 0.16125758 3.4765175e-84 807.5198 0 +17 20 100505 20 153571 4560 9 379 60 0.010773058 0.85703266 0.32679901 0.1067976 2.1533657e-55 534.75056 0 +17 20 153571 20 100505 4560 9 379 60 0.010773058 0.85703266 0.32679901 0.1067976 2.1533657e-55 534.75056 0 +17 20 100505 20 153572 4560 9 379 60 0.010773058 0.85703266 0.32679901 0.1067976 2.1533657e-55 534.75056 0 +17 20 153572 20 
100505 4560 9 379 60 0.010773058 0.85703266 0.32679901 0.1067976 2.1533657e-55 534.75056 0 +17 20 100505 20 155621 4486 83 275 164 0.028424133 0.63168091 0.46416828 0.21545218 1.2369172e-236 1078.9577 0 +17 20 155621 20 100505 4486 83 275 164 0.028424133 0.63168091 0.46416828 0.21545218 1.2369172e-236 1078.9577 0 +17 20 100505 20 156111 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 156111 20 100505 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 100505 20 156142 4159 410 127 312 0.049662471 0.661973 0.4999418 0.2499418 3.5478779e-274 1251.6985 0 +17 20 156142 20 100505 4159 410 127 312 0.049662471 0.661973 0.4999418 0.2499418 3.5478779e-274 1251.6985 0 +17 20 100505 20 156299 4534 35 323 116 0.020519843 0.74594116 0.42431396 0.18004234 1.0918409e-96 901.60937 0 +17 20 156299 20 100505 4534 35 323 116 0.020519843 0.74594116 0.42431396 0.18004234 1.0918409e-96 901.60937 0 +17 20 100505 20 156300 4534 35 323 116 0.020519843 0.74594116 0.42431396 0.18004234 1.0918409e-96 901.60937 0 +17 20 156300 20 100505 4534 35 323 116 0.020519843 0.74594116 0.42431396 0.18004234 1.0918409e-96 901.60937 0 +17 20 100505 20 156454 4525 44 352 87 0.015079188 0.63185018 0.33408096 0.11161009 1.4151123e-62 558.89433 0 +17 20 156454 20 100505 4525 44 352 87 0.015079188 0.63185018 0.33408096 0.11161009 1.4151123e-62 558.89433 0 +17 20 100505 20 156532 4552 17 353 86 0.015369617 0.81909317 0.38292235 0.14662954 1.0958227e-76 734.25882 0 +17 20 156532 20 100505 4552 17 353 86 0.015369617 0.81909317 0.38292235 0.14662954 1.0958227e-76 734.25882 0 +17 20 100505 20 156755 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 156755 20 100505 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 100505 20 156966 4319 250 276 163 0.025318796 0.33651194 0.32547009 0.10593078 2.2155723e-117 530.48476 0 +17 20 156966 20 100505 4319 250 
276 163 0.025318796 0.33651194 0.32547009 0.10593078 2.2155723e-117 530.48476 0 +17 20 100505 20 157523 4559 10 380 59 0.010573379 0.84114748 0.32074177 0.10287529 1.5529809e-53 515.10751 0 +17 20 157523 20 100505 4559 10 380 59 0.010573379 0.84114748 0.32074177 0.10287529 1.5529809e-53 515.10751 0 +17 20 100505 20 158264 4495 74 318 121 0.020748073 0.58405083 0.37926111 0.143839 1.1448139e-158 720.31237 0 +17 20 158264 20 100505 4495 74 318 121 0.020748073 0.58405083 0.37926111 0.143839 1.1448139e-158 720.31237 0 +17 20 100505 20 158498 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 158498 20 100505 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 100505 20 158746 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 158746 20 100505 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 100505 20 158984 4523 46 353 86 0.014862004 0.61803192 0.32805324 0.10761894 9.3582997e-61 538.90699 0 +17 20 158984 20 100505 4523 46 353 86 0.014862004 0.61803192 0.32805324 0.10761894 9.3582997e-61 538.90699 0 +17 20 100505 20 159524 4524 45 353 86 0.014879507 0.62348312 0.32965702 0.10867374 3.5123827e-61 544.18915 0 +17 20 159524 20 100505 4524 45 353 86 0.014879507 0.62348312 0.32965702 0.10867374 3.5123827e-61 544.18915 0 +145 20 100505 20 159745 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +145 20 159745 20 100505 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +17 20 100505 20 159819 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 159819 20 100505 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +145 20 100505 20 160216 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +145 20 160216 20 100505 4119 450 230 
209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +17 20 100505 20 161586 4524 45 353 86 0.014879507 0.62348312 0.32965702 0.10867374 3.5123827e-61 544.18915 0 +17 20 161586 20 100505 4524 45 353 86 0.014879507 0.62348312 0.32965702 0.10867374 3.5123827e-61 544.18915 0 +145 20 100505 20 162102 4101 468 155 284 0.0435463 0.58453935 0.43104967 0.18580382 2.3041899e-204 930.49582 0 +145 20 162102 20 100505 4101 468 155 284 0.0435463 0.58453935 0.43104967 0.18580382 2.3041899e-204 930.49582 0 +17 20 100505 20 162291 4524 45 353 86 0.014879507 0.62348312 0.32965702 0.10867374 3.5123827e-61 544.18915 0 +17 20 162291 20 100505 4524 45 353 86 0.014879507 0.62348312 0.32965702 0.10867374 3.5123827e-61 544.18915 0 +145 20 100505 20 162292 4136 433 231 208 0.030313518 0.39656761 0.32084963 0.10294448 3.962886e-114 515.53481 0 +145 20 162292 20 100505 4136 433 231 208 0.030313518 0.39656761 0.32084963 0.10294448 3.962886e-114 515.53481 0 +17 20 100505 20 162684 4554 15 364 75 0.013400682 0.81731957 0.35669512 0.12723142 1.6005073e-66 637.10435 0 +17 20 162684 20 100505 4554 15 364 75 0.013400682 0.81731957 0.35669512 0.12723142 1.6005073e-66 637.10435 0 +17 20 100505 20 162810 4551 18 349 90 0.016080821 0.81731963 0.39145702 0.15323861 3.2565827e-80 767.35972 0 +17 20 162810 20 100505 4551 18 349 90 0.016080821 0.81731963 0.39145702 0.15323861 3.2565827e-80 767.35972 0 +17 20 100505 20 163483 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 163483 20 100505 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 100505 20 164595 4554 15 364 75 0.013400682 0.81731957 0.35669512 0.12723142 1.6005073e-66 637.10435 0 +17 20 164595 20 100505 4554 15 364 75 0.013400682 0.81731957 0.35669512 0.12723142 1.6005073e-66 637.10435 0 +145 20 100505 20 165124 4116 453 230 209 0.030145615 0.39627671 0.3147285 0.099054039 6.8710753e-110 496.0518 0 +145 20 165124 20 100505 4116 453 230 209 
0.030145615 0.39627671 0.3147285 0.099054039 6.8710753e-110 496.0518 0 +17 20 100505 20 165125 4552 17 362 77 0.013730028 0.80177224 0.35774675 0.12798274 3.4560954e-67 640.86991 0 +17 20 165125 20 100505 4552 17 362 77 0.013730028 0.80177224 0.35774675 0.12798274 3.4560954e-67 640.86991 0 +17 20 100505 20 165433 4524 45 352 87 0.015061685 0.62633562 0.33246088 0.11053023 3.854968e-62 553.4867 0 +17 20 165433 20 100505 4524 45 352 87 0.015061685 0.62633562 0.33246088 0.11053023 3.854968e-62 553.4867 0 +145 20 100505 20 165479 4105 464 155 284 0.043616317 0.58492947 0.43269232 0.18722264 6.5760166e-206 937.60117 0 +145 20 165479 20 100505 4105 464 155 284 0.043616317 0.58492947 0.43269232 0.18722264 6.5760166e-206 937.60117 0 +17 20 100505 20 165704 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 165704 20 100505 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +145 20 100505 20 165802 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 165802 20 100505 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 100505 20 166163 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 166163 20 100505 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 100505 20 166188 4105 464 155 284 0.043616317 0.58492947 0.43269232 0.18722264 6.5760166e-206 937.60117 0 +145 20 166188 20 100505 4105 464 155 284 0.043616317 0.58492947 0.43269232 0.18722264 6.5760166e-206 937.60117 0 +145 20 100505 20 167218 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 167218 20 100505 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +17 20 100505 20 167301 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 167301 20 100505 4560 9 
380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +145 20 100505 20 167325 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 167325 20 100505 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 100505 20 167441 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 167441 20 100505 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 100505 20 167455 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 167455 20 100505 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +17 20 100505 20 167599 4565 4 376 63 0.011407108 0.93456227 0.35108843 0.1232631 1.9270085e-63 617.20699 0 +17 20 167599 20 100505 4565 4 376 63 0.011407108 0.93456227 0.35108843 0.1232631 1.9270085e-63 617.20699 0 +17 20 100505 20 167643 4551 18 348 91 0.016262997 0.81899559 0.39411175 0.15532409 2.7776987e-81 777.80441 0 +17 20 167643 20 100505 4551 18 348 91 0.016262997 0.81899559 0.39411175 0.15532409 2.7776987e-81 777.80441 0 +145 20 100505 20 167647 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 167647 20 100505 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 100505 20 167774 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +145 20 167774 20 100505 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +145 20 100505 20 167900 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +145 20 167900 20 100505 4119 450 230 209 0.030198127 0.39669317 0.31588456 0.099783055 1.1032238e-110 499.70267 0 +145 20 100505 20 168020 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +145 20 
168020 20 100505 4120 449 230 209 0.03021563 0.39683187 0.31627136 0.10002758 5.9733264e-111 500.92727 0 +17 20 100505 20 169131 4524 45 352 87 0.015061685 0.62633562 0.33246088 0.11053023 3.854968e-62 553.4867 0 +17 20 169131 20 100505 4524 45 352 87 0.015061685 0.62633562 0.33246088 0.11053023 3.854968e-62 553.4867 0 +17 20 100505 20 170114 4524 45 352 87 0.015061685 0.62633562 0.33246088 0.11053023 3.854968e-62 553.4867 0 +17 20 170114 20 100505 4524 45 352 87 0.015061685 0.62633562 0.33246088 0.11053023 3.854968e-62 553.4867 0 +17 20 100505 20 170805 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +17 20 170805 20 100505 4560 9 380 59 0.010590882 0.85493022 0.32359365 0.10471284 2.4348223e-54 524.30878 0 +789 20 100582 20 148998 4996 0 6 6 0.0011952122 1 0.70668256 0.49940023 4.2084374e-17 2500.9964 0 +789 20 148998 20 100582 4996 0 6 6 0.0011952122 1 0.70668256 0.49940023 4.2084374e-17 2500.9964 0 +17 20 100699 20 144535 2659 1352 269 728 0.062681817 0.53852272 0.31855109 0.10147481 1.5757669e-112 508.18324 0 +17 20 144535 20 100699 2659 1352 269 728 0.062681817 0.53852272 0.31855109 0.10147481 1.5757669e-112 508.18324 0 +17 20 100699 20 144560 2869 1142 335 662 0.060474642 0.47480389 0.31547397 0.09952382 2.1054702e-110 498.41264 0 +17 20 144560 20 100699 2869 1142 335 662 0.060474642 0.47480389 0.31547397 0.09952382 2.1054702e-110 498.41264 0 +17 20 100699 20 145413 2013 1998 93 904 0.065148868 0.77818346 0.33050933 0.10923643 5.5076759e-121 547.05354 0 +17 20 145413 20 100699 2013 1998 93 904 0.065148868 0.77818346 0.33050933 0.10923643 5.5076759e-121 547.05354 0 +17 20 100699 20 149407 1978 2033 102 895 0.062318183 0.75367641 0.31670311 0.10030085 2.9967682e-111 502.30407 0 +17 20 149407 20 100699 1978 2033 102 895 0.062318183 0.75367641 0.31670311 0.10030085 2.9967682e-111 502.30407 0 +145 20 100699 20 149975 1935 2076 72 925 0.06540665 0.81980026 0.3342489 0.11172232 1.0784247e-123 559.50274 0 +145 20 149975 20 100699 
1935 2076 72 925 0.06540665 0.81980026 0.3342489 0.11172232 1.0784247e-123 559.50274 0 +17 20 100699 20 151330 1971 2040 102 895 0.062039912 0.75284469 0.31544396 0.099504888 2.2078111e-110 498.3179 0 +17 20 151330 20 100699 1971 2040 102 895 0.062039912 0.75284469 0.31544396 0.099504888 2.2078111e-110 498.3179 0 +17 20 100699 20 153393 2270 1741 152 845 0.065929577 0.684762 0.33039519 0.10916097 6.6555322e-121 546.67562 0 +17 20 153393 20 100699 2270 1741 152 845 0.065929577 0.684762 0.33039519 0.10916097 6.6555322e-121 546.67562 0 +17 20 100699 20 155955 2518 1493 174 823 0.072269827 0.67532903 0.3629986 0.13176799 1.5772802e-145 659.8916 0 +17 20 155955 20 100699 2518 1493 174 823 0.072269827 0.67532903 0.3629986 0.13176799 1.5772802e-145 659.8916 0 +17 20 100699 20 156652 2507 1504 183 814 0.070393205 0.65828198 0.35355148 0.12499864 3.7226359e-138 625.99052 0 +17 20 156652 20 100699 2507 1504 183 814 0.070393205 0.65828198 0.35355148 0.12499864 3.7226359e-138 625.99052 0 +17 20 100699 20 158315 3266 745 440 557 0.059464037 0.40362912 0.33950871 0.11526617 1.4869816e-127 577.24981 0 +17 20 158315 20 100699 3266 745 440 557 0.059464037 0.40362912 0.33950871 0.11526617 1.4869816e-127 577.24981 0 +17 20 100699 20 160533 2522 1489 189 808 0.070029922 0.64981186 0.35195985 0.12387573 6.2214777e-137 620.3672 0 +17 20 160533 20 100699 2522 1489 189 808 0.070029922 0.64981186 0.35195985 0.12387573 6.2214777e-137 620.3672 0 +17 20 100699 20 162683 2521 1490 189 808 0.069990166 0.64968258 0.35174838 0.12372692 9.0359277e-137 619.62199 0 +17 20 162683 20 100699 2521 1490 189 808 0.069990166 0.64968258 0.35174838 0.12372692 9.0359277e-137 619.62199 0 +17 20 100699 20 166727 2522 1489 189 808 0.070029922 0.64981186 0.35195985 0.12387573 6.2214777e-137 620.3672 0 +17 20 166727 20 100699 2522 1489 189 808 0.070029922 0.64981186 0.35195985 0.12387573 6.2214777e-137 620.3672 0 +17 20 100699 20 166824 2520 1491 189 808 0.069950417 0.6495533 0.35153702 0.12357828 1.3118996e-136 
618.87748 0 +17 20 166824 20 100699 2520 1491 189 808 0.069950417 0.6495533 0.35153702 0.12357828 1.3118996e-136 618.87748 0 +17 20 100699 20 167310 2522 1489 189 808 0.070029922 0.64981186 0.35195985 0.12387573 6.2214777e-137 620.3672 0 +17 20 167310 20 100699 2522 1489 189 808 0.070029922 0.64981186 0.35195985 0.12387573 6.2214777e-137 620.3672 0 +17 20 100699 20 168080 2394 1617 167 830 0.068459988 0.67245108 0.34298077 0.11763581 3.8977881e-130 589.11772 0 +17 20 168080 20 100699 2394 1617 167 830 0.068459988 0.67245108 0.34298077 0.11763581 3.8977881e-130 589.11772 0 +17 20 100699 20 169701 2520 1491 189 808 0.069950417 0.6495533 0.35153702 0.12357828 1.3118996e-136 618.87748 0 +17 20 169701 20 100699 2520 1491 189 808 0.069950417 0.6495533 0.35153702 0.12357828 1.3118996e-136 618.87748 0 +277 20 100767 20 148529 4897 78 0 33 0.0064434046 1 0.54095858 0.29263616 1.4028201e-57 1465.5219 0 +277 20 148529 20 100767 4897 78 0 33 0.0064434046 1 0.54095858 0.29263616 1.4028201e-57 1465.5219 0 +277 20 100767 20 152686 4897 78 0 33 0.0064434046 1 0.54095858 0.29263616 1.4028201e-57 1465.5219 0 +277 20 152686 20 100767 4897 78 0 33 0.0064434046 1 0.54095858 0.29263616 1.4028201e-57 1465.5219 0 +277 20 100767 20 154631 4898 77 0 33 0.006444721 1.0000001 0.5434674 0.29535684 9.8576549e-58 1479.1468 0 +277 20 154631 20 100767 4898 77 0 33 0.006444721 1.0000001 0.5434674 0.29535684 9.8576549e-58 1479.1468 0 +277 20 100767 20 157675 4886 89 0 33 0.0064289309 1 0.51541525 0.26565287 5.39036e-56 1330.3895 0 +277 20 157675 20 100767 4886 89 0 33 0.0064289309 1 0.51541525 0.26565287 5.39036e-56 1330.3895 0 +277 20 100767 20 158333 4887 88 0 33 0.0064302464 0.99999994 0.51759356 0.26790312 3.9323118e-56 1341.659 0 +277 20 158333 20 100767 4887 88 0 33 0.0064302464 0.99999994 0.51759356 0.26790312 3.9323118e-56 1341.659 0 +277 20 100767 20 158385 4887 88 0 33 0.0064302464 0.99999994 0.51759356 0.26790312 3.9323118e-56 1341.659 0 +277 20 158385 20 100767 4887 88 0 33 0.0064302464 
0.99999994 0.51759356 0.26790312 3.9323118e-56 1341.659 0 +277 20 100767 20 162056 4887 88 0 33 0.0064302464 0.99999994 0.51759356 0.26790312 3.9323118e-56 1341.659 0 +277 20 162056 20 100767 4887 88 0 33 0.0064302464 0.99999994 0.51759356 0.26790312 3.9323118e-56 1341.659 0 +273 20 100767 20 167026 4887 88 1 32 0.0062318821 0.96895307 0.5036608 0.25367418 5.2679811e-53 1269.7481 0 +273 20 167026 20 100767 4887 88 1 32 0.0062318821 0.96895307 0.5036608 0.25367418 5.2679811e-53 1269.7481 0 +81 20 100904 20 141092 4810 109 4 85 0.01628441 0.95324504 0.63872415 0.40796849 1.4228168e-123 2042.9525 0 +81 20 141092 20 100904 4810 109 4 85 0.01628441 0.95324504 0.63872415 0.40796849 1.4228168e-123 2042.9525 0 +81 20 100904 20 149381 4890 29 7 82 0.015979905 0.91956556 0.82156676 0.67497194 1.0896518e-144 3379.9952 0 +81 20 149381 20 100904 4890 29 7 82 0.015979905 0.91956556 0.82156676 0.67497194 1.0896518e-144 3379.9952 0 +81 20 100904 20 149599 4891 28 7 82 0.015983451 0.91958183 0.82539159 0.68127131 2.8508629e-145 3411.5409 0 +81 20 149599 20 100904 4891 28 7 82 0.015983451 0.91958183 0.82539159 0.68127131 2.8508629e-145 3411.5409 0 +81 20 100904 20 151677 4891 28 7 82 0.015983451 0.91958183 0.82539159 0.68127131 2.8508629e-145 3411.5409 0 +81 20 151677 20 100904 4891 28 7 82 0.015983451 0.91958183 0.82539159 0.68127131 2.8508629e-145 3411.5409 0 +81 20 100904 20 154403 4739 180 8 81 0.015247928 0.9051702 0.51924962 0.26962015 1.0659544e-99 1350.142 0 +81 20 154403 20 100904 4739 180 8 81 0.015247928 0.9051702 0.51924962 0.26962015 1.0659544e-99 1350.142 0 +81 20 100904 20 154809 4739 180 8 81 0.015247928 0.9051702 0.51924962 0.26962015 1.0659544e-99 1350.142 0 +81 20 154809 20 100904 4739 180 8 81 0.015247928 0.9051702 0.51924962 0.26962015 1.0659544e-99 1350.142 0 +209 20 100904 20 155449 4595 324 8 81 0.014736925 0.90220356 0.40912318 0.16738178 4.9935135e-82 838.17094 0 +209 20 155449 20 100904 4595 324 8 81 0.014736925 0.90220356 0.40912318 0.16738178 
4.9935135e-82 838.17094 0 +81 20 100904 20 155732 4754 165 8 81 0.015301158 0.90546882 0.53586727 0.28715375 3.4329279e-102 1437.9436 0 +81 20 155732 20 100904 4754 165 8 81 0.015301158 0.90546882 0.53586727 0.28715375 3.4329279e-102 1437.9436 0 +81 20 100904 20 156532 4886 33 19 70 0.013612127 0.78203398 0.72591048 0.52694607 1.9522114e-113 2638.6619 0 +81 20 156532 20 100904 4886 33 19 70 0.013612127 0.78203398 0.72591048 0.52694607 1.9522114e-113 2638.6619 0 +81 20 100904 20 158744 4755 164 9 80 0.015108574 0.89369708 0.53117532 0.28214723 4.0381919e-100 1412.87 0 +81 20 158744 20 100904 4755 164 9 80 0.015108574 0.89369708 0.53117532 0.28214723 4.0381919e-100 1412.87 0 +81 20 100904 20 160207 4753 166 9 80 0.015101478 0.89365244 0.52887422 0.27970794 8.8526753e-100 1400.6549 0 +81 20 160207 20 100904 4753 166 9 80 0.015101478 0.89365244 0.52887422 0.27970794 8.8526753e-100 1400.6549 0 +81 20 100904 20 162810 4890 29 10 79 0.015391508 0.8851639 0.80198544 0.64318061 7.1061094e-137 3220.7775 0 +81 20 162810 20 100904 4890 29 10 79 0.015391508 0.8851639 0.80198544 0.64318061 7.1061094e-137 3220.7775 0 +81 20 100904 20 167599 4915 4 26 63 0.012342113 0.93921828 0.81309241 0.66111928 3.7910534e-118 3310.4525 0 +81 20 167599 20 100904 4915 4 26 63 0.012342113 0.93921828 0.81309241 0.66111928 3.7910534e-118 3310.4525 0 +81 20 100904 20 167643 4890 29 9 80 0.015587639 0.89662635 0.80855316 0.65375823 1.975768e-139 3273.753 0 +81 20 167643 20 100904 4890 29 9 80 0.015587639 0.89662635 0.80855316 0.65375823 1.975768e-139 3273.753 0 +789 20 101147 20 151750 4999 7 0 2 0.00039864331 1 0.47107482 0.22191148 2.8569015e-06 1111.3327 0 +789 20 151750 20 101147 4999 7 0 2 0.00039864331 1 0.47107482 0.22191148 2.8569015e-06 1111.3327 0 +789 20 101147 20 168457 4997 9 0 2 0.00039848383 1 0.42601794 0.1814913 4.3647234e-06 908.90844 0 +789 20 168457 20 101147 4997 9 0 2 0.00039848383 1 0.42601794 0.1814913 4.3647234e-06 908.90844 0 +17 20 101355 20 144279 4816 55 79 58 0.010964206 
0.49958485 0.45260665 0.20485279 4.5814491e-225 1025.7326 0 +17 20 144279 20 101355 4816 55 79 58 0.010964206 0.49958485 0.45260665 0.20485279 4.5814491e-225 1025.7326 0 +17 20 101355 20 144567 4798 73 85 52 0.009700573 0.39957461 0.38120475 0.14531706 2.9883221e-160 727.59374 0 +17 20 144567 20 101355 4798 73 85 52 0.009700573 0.39957461 0.38120475 0.14531706 2.9883221e-160 727.59374 0 +17 20 101355 20 148597 4817 54 85 52 0.0098043606 0.47623786 0.41757929 0.17437246 6.9950864e-192 873.07637 0 +17 20 148597 20 101355 4817 54 85 52 0.0098043606 0.47623786 0.41757929 0.17437246 6.9950864e-192 873.07637 0 +17 20 101355 20 148801 4797 74 85 52 0.0096951108 0.39618018 0.3795141 0.14403094 7.511459e-159 721.154 0 +17 20 148801 20 101355 4797 74 85 52 0.0096951108 0.39618018 0.3795141 0.14403094 7.511459e-159 721.154 0 +17 20 101355 20 153154 4796 75 85 52 0.0096896486 0.39283919 0.37784272 0.14276512 1.7945108e-157 714.81586 0 +17 20 153154 20 101355 4796 75 85 52 0.0096896486 0.39283919 0.37784272 0.14276512 1.7945108e-157 714.81586 0 +17 20 101355 20 154241 4493 378 26 111 0.019493369 0.78968275 0.40259707 0.1620844 2.800513e-91 811.67623 0 +17 20 154241 20 101355 4493 378 26 111 0.019493369 0.78968275 0.40259707 0.1620844 2.800513e-91 811.67623 0 +17 20 101355 20 154315 4796 75 87 50 0.0093012117 0.38312459 0.365511 0.13359828 1.7277662e-147 668.90615 0 +17 20 154315 20 101355 4796 75 87 50 0.0093012117 0.38312459 0.365511 0.13359828 1.7277662e-147 668.90615 0 +17 20 101355 20 154837 4494 377 26 111 0.019498834 0.78972942 0.40307775 0.16247168 2.1772141e-91 813.61538 0 +17 20 154837 20 101355 4494 377 26 111 0.019498834 0.78972942 0.40307775 0.16247168 2.1772141e-91 813.61538 0 +17 20 101355 20 155621 4734 137 27 110 0.020615617 0.79269522 0.5836575 0.34065607 2.05707e-128 1705.9256 0 +17 20 155621 20 101355 4734 137 27 110 0.020615617 0.79269522 0.5836575 0.34065607 2.05707e-128 1705.9256 0 +17 20 101355 20 156127 4796 75 85 52 0.0096896486 0.39283919 0.37784272 
0.14276512 1.7945108e-157 714.81586 0 +17 20 156127 20 101355 4796 75 85 52 0.0096896486 0.39283919 0.37784272 0.14276512 1.7945108e-157 714.81586 0 +17 20 101355 20 156454 4796 75 81 56 0.01046652 0.41137841 0.40202177 0.1616215 5.2511156e-178 809.25325 0 +17 20 156454 20 101355 4796 75 81 56 0.01046652 0.41137841 0.40202177 0.1616215 5.2511156e-178 809.25325 0 +17 20 101355 20 156966 4566 305 29 108 0.019309478 0.76929533 0.43034032 0.18519278 4.039081e-95 927.39614 0 +17 20 156966 20 101355 4566 305 29 108 0.019309478 0.76929533 0.43034032 0.18519278 4.039081e-95 927.39614 0 +17 20 101355 20 157047 4425 446 28 109 0.018733485 0.77014762 0.36585164 0.13384742 2.2027474e-81 670.26988 0 +17 20 157047 20 101355 4425 446 28 109 0.018733485 0.77014762 0.36585164 0.13384742 2.2027474e-81 670.26988 0 +17 20 101355 20 157851 4425 446 29 108 0.018539269 0.7619921 0.36234468 0.13129367 6.6522222e-80 657.48046 0 +17 20 157851 20 101355 4425 446 29 108 0.018539269 0.7619921 0.36234468 0.13129367 6.6522222e-80 657.48046 0 +17 20 101355 20 157975 4423 448 30 107 0.018334124 0.75372958 0.3580524 0.12820153 2.9210767e-78 641.99521 0 +17 20 157975 20 101355 4423 448 30 107 0.018334124 0.75372958 0.3580524 0.12820153 2.9210767e-78 641.99521 0 +17 20 101355 20 157984 4425 446 29 108 0.018539269 0.7619921 0.36234468 0.13129367 6.6522222e-80 657.48046 0 +17 20 157984 20 101355 4425 446 29 108 0.018539269 0.7619921 0.36234468 0.13129367 6.6522222e-80 657.48046 0 +17 20 101355 20 158390 4425 446 29 108 0.018539269 0.7619921 0.36234468 0.13129367 6.6522222e-80 657.48046 0 +17 20 158390 20 101355 4425 446 29 108 0.018539269 0.7619921 0.36234468 0.13129367 6.6522222e-80 657.48046 0 +17 20 101355 20 158684 4425 446 28 109 0.018733485 0.77014762 0.36585164 0.13384742 2.2027474e-81 670.26988 0 +17 20 158684 20 101355 4425 446 28 109 0.018733485 0.77014762 0.36585164 0.13384742 2.2027474e-81 670.26988 0 +17 20 101355 20 158690 4425 446 28 109 0.018733485 0.77014762 0.36585164 0.13384742 
2.2027474e-81 670.26988 0 +17 20 158690 20 101355 4425 446 28 109 0.018733485 0.77014762 0.36585164 0.13384742 2.2027474e-81 670.26988 0 +17 20 101355 20 158984 4794 77 82 55 0.010261377 0.40026003 0.39268664 0.15420279 6.2783737e-170 772.10143 0 +17 20 158984 20 101355 4794 77 82 55 0.010261377 0.40026003 0.39268664 0.15420279 6.2783737e-170 772.10143 0 +17 20 101355 20 159097 4420 451 29 108 0.018511955 0.76172453 0.36039144 0.12988198 1.892218e-79 650.41113 0 +17 20 159097 20 101355 4420 451 29 108 0.018511955 0.76172453 0.36039144 0.12988198 1.892218e-79 650.41113 0 +145 20 101355 20 159245 4320 551 29 108 0.017965704 0.75624573 0.3258113 0.10615299 3.0647915e-71 531.58134 0 +145 20 159245 20 101355 4320 551 29 108 0.017965704 0.75624573 0.3258113 0.10615299 3.0647915e-71 531.58134 0 +17 20 101355 20 159524 4795 76 82 55 0.01026684 0.40353015 0.39435199 0.1555135 2.349084e-171 778.66433 0 +17 20 159524 20 101355 4795 76 82 55 0.01026684 0.40353015 0.39435199 0.1555135 2.349084e-171 778.66433 0 +145 20 101355 20 159745 4321 550 28 109 0.018165383 0.764651 0.32943252 0.10852578 1.0381314e-72 543.4643 0 +145 20 159745 20 101355 4321 550 28 109 0.018165383 0.764651 0.32943252 0.10852578 1.0381314e-72 543.4643 0 +145 20 101355 20 160216 4321 550 28 109 0.018165383 0.764651 0.32943252 0.10852578 1.0381314e-72 543.4643 0 +145 20 160216 20 101355 4321 550 28 109 0.018165383 0.764651 0.32943252 0.10852578 1.0381314e-72 543.4643 0 +145 20 101355 20 161502 4320 551 29 108 0.017965704 0.75624573 0.3258113 0.10615299 3.0647915e-71 531.58134 0 +145 20 161502 20 101355 4320 551 29 108 0.017965704 0.75624573 0.3258113 0.10615299 3.0647915e-71 531.58134 0 +17 20 101355 20 161586 4795 76 82 55 0.01026684 0.40353015 0.39435199 0.1555135 2.349084e-171 778.66433 0 +17 20 161586 20 101355 4795 76 82 55 0.01026684 0.40353015 0.39435199 0.1555135 2.349084e-171 778.66433 0 +145 20 101355 20 161736 4271 600 27 110 0.018086478 0.77036345 0.317871 0.10104197 1.8537637e-70 505.98732 0 +145 
20 161736 20 101355 4271 600 27 110 0.018086478 0.77036345 0.317871 0.10104197 1.8537637e-70 505.98732 0 +145 20 101355 20 162217 4320 551 29 108 0.017965704 0.75624573 0.3258113 0.10615299 3.0647915e-71 531.58134 0 +145 20 162217 20 101355 4320 551 29 108 0.017965704 0.75624573 0.3258113 0.10615299 3.0647915e-71 531.58134 0 +17 20 101355 20 162291 4795 76 82 55 0.01026684 0.40353015 0.39435199 0.1555135 2.349084e-171 778.66433 0 +17 20 162291 20 101355 4795 76 82 55 0.01026684 0.40353015 0.39435199 0.1555135 2.349084e-171 778.66433 0 +145 20 101355 20 162292 4338 533 29 108 0.018064028 0.75725031 0.3314769 0.10987693 1.3142146e-72 550.23015 0 +145 20 162292 20 101355 4338 533 29 108 0.018064028 0.75725031 0.3314769 0.10987693 1.3142146e-72 550.23015 0 +145 20 101355 20 164083 4322 549 29 108 0.017976629 0.75635773 0.3264302 0.10655668 2.1704563e-71 533.60287 0 +145 20 164083 20 101355 4322 549 29 108 0.017976629 0.75635773 0.3264302 0.10655668 2.1704563e-71 533.60287 0 +145 20 101355 20 164436 4322 549 29 108 0.017976629 0.75635773 0.3264302 0.10655668 2.1704563e-71 533.60287 0 +145 20 164436 20 101355 4322 549 29 108 0.017976629 0.75635773 0.3264302 0.10655668 2.1704563e-71 533.60287 0 +145 20 101355 20 165124 4318 553 28 109 0.018148998 0.76448858 0.32850203 0.10791359 1.7486855e-72 540.39852 0 +145 20 165124 20 101355 4318 553 28 109 0.018148998 0.76448858 0.32850203 0.10791359 1.7486855e-72 540.39852 0 +17 20 101355 20 165433 4795 76 81 56 0.010461058 0.40804884 0.40032807 0.16026258 1.5835754e-176 802.44885 0 +17 20 165433 20 101355 4795 76 81 56 0.010461058 0.40804884 0.40032807 0.16026258 1.5835754e-176 802.44885 0 +145 20 101355 20 165802 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 165802 20 101355 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 101355 20 166163 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 
166163 20 101355 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 101355 20 167218 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 167218 20 101355 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 101355 20 167325 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 167325 20 101355 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 101355 20 167441 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 167441 20 101355 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 101355 20 167455 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 167455 20 101355 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 101355 20 167647 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 167647 20 101355 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 101355 20 167774 4321 550 28 109 0.018165383 0.764651 0.32943252 0.10852578 1.0381314e-72 543.4643 0 +145 20 167774 20 101355 4321 550 28 109 0.018165383 0.764651 0.32943252 0.10852578 1.0381314e-72 543.4643 0 +145 20 101355 20 167900 4321 550 28 109 0.018165383 0.764651 0.32943252 0.10852578 1.0381314e-72 543.4643 0 +145 20 167900 20 101355 4321 550 28 109 0.018165383 0.764651 0.32943252 0.10852578 1.0381314e-72 543.4643 0 +145 20 101355 20 168020 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +145 20 168020 20 101355 4322 549 28 109 0.018170848 0.76470518 0.32974401 0.10873111 8.7197572e-73 544.49245 0 +17 20 101355 20 169131 4795 76 81 56 0.010461058 0.40804884 0.40032807 0.16026258 1.5835754e-176 802.44885 0 +17 
20 169131 20 101355 4795 76 81 56 0.010461058 0.40804884 0.40032807 0.16026258 1.5835754e-176 802.44885 0 +17 20 101355 20 170114 4795 76 81 56 0.010461058 0.40804884 0.40032807 0.16026258 1.5835754e-176 802.44885 0 +17 20 170114 20 101355 4795 76 81 56 0.010461058 0.40804884 0.40032807 0.16026258 1.5835754e-176 802.44885 0 +17 20 101362 20 144382 2778 1405 94 731 0.075703472 0.80132014 0.41264892 0.17027912 1.8308212e-187 852.7549 0 +17 20 144382 20 101362 2778 1405 94 731 0.075703472 0.80132014 0.41264892 0.17027912 1.8308212e-187 852.7549 0 +17 20 101362 20 144535 2800 1383 128 697 0.070756443 0.73463166 0.38708517 0.14983492 3.3335503e-165 750.3702 0 +17 20 144535 20 101362 2800 1383 128 697 0.070756443 0.73463166 0.38708517 0.14983492 3.3335503e-165 750.3702 0 +17 20 101362 20 144560 3035 1148 169 656 0.071648464 0.67981237 0.4023464 0.16188262 2.5388779e-178 810.7049 0 +17 20 144560 20 101362 3035 1148 169 656 0.071648464 0.67981237 0.4023464 0.16188262 2.5388779e-178 810.7049 0 +17 20 101362 20 153267 2582 1601 128 697 0.063585393 0.71328449 0.34399763 0.11833436 6.7599811e-131 592.61576 0 +17 20 153267 20 101362 2582 1601 128 697 0.063585393 0.71328449 0.34399763 0.11833436 6.7599811e-131 592.61576 0 +17 20 101362 20 153691 2689 1494 131 694 0.066604771 0.71801072 0.36200526 0.13104782 9.6022197e-145 656.28448 0 +17 20 153691 20 101362 2689 1494 131 694 0.066604771 0.71801072 0.36200526 0.13104782 9.6022197e-145 656.28448 0 +17 20 101362 20 154614 2691 1492 164 661 0.061166629 0.65130305 0.33307898 0.1109416 7.6451617e-123 555.59259 0 +17 20 154614 20 101362 2691 1492 164 661 0.061166629 0.65130305 0.33307898 0.1109416 7.6451617e-123 555.59259 0 +17 20 101362 20 155955 2550 1633 142 683 0.060197778 0.67979831 0.32548532 0.10594069 2.1465457e-117 530.54795 0 +17 20 155955 20 101362 2550 1633 142 683 0.060197778 0.67979831 0.32548532 0.10594069 2.1465457e-117 530.54795 0 +17 20 101362 20 156652 2546 1637 144 681 0.059732616 0.67504692 0.32295087 0.10429726 
1.3251905e-115 522.31779 0 +17 20 156652 20 101362 2546 1637 144 681 0.059732616 0.67504692 0.32295087 0.10429726 1.3251905e-115 522.31779 0 +17 20 101362 20 157272 4080 103 270 555 0.089178003 0.81259215 0.71163601 0.5064258 0 2536.1739 0 +17 20 157272 20 101362 4080 103 270 555 0.089178003 0.81259215 0.71163601 0.5064258 0 2536.1739 0 +17 20 101362 20 158315 3474 709 232 593 0.075581707 0.61999184 0.46453267 0.2157906 5.2362717e-237 1080.6753 0 +17 20 158315 20 101362 3474 709 232 593 0.075581707 0.61999184 0.46453267 0.2157906 5.2362717e-237 1080.6753 0 +17 20 101362 20 158434 2933 1250 155 670 0.070628211 0.69530541 0.39160359 0.15335339 4.9164701e-169 767.99062 0 +17 20 158434 20 101362 2933 1250 155 670 0.070628211 0.69530541 0.39160359 0.15335339 4.9164701e-169 767.99062 0 +17 20 101362 20 159289 2951 1232 156 669 0.071053527 0.69521493 0.39471325 0.15579854 1.0694949e-171 780.23599 0 +17 20 159289 20 101362 2951 1232 156 669 0.071053527 0.69521493 0.39471325 0.15579854 1.0694949e-171 780.23599 0 +17 20 101362 20 160533 2563 1620 148 677 0.059624683 0.66860753 0.32258084 0.1040584 2.4127984e-115 521.12161 0 +17 20 160533 20 101362 2563 1620 148 677 0.059624683 0.66860753 0.32258084 0.1040584 2.4127984e-115 521.12161 0 +17 20 101362 20 162683 2562 1621 148 677 0.059591796 0.66848534 0.32239223 0.10393674 3.274134e-115 520.51224 0 +17 20 162683 20 101362 2562 1621 148 677 0.059591796 0.66848534 0.32239223 0.10393674 3.274134e-115 520.51224 0 +17 20 101362 20 164340 2702 1481 149 676 0.064030252 0.68275154 0.3485935 0.12151744 2.3053437e-134 608.5564 0 +17 20 164340 20 101362 2702 1481 149 676 0.064030252 0.68275154 0.3485935 0.12151744 2.3053437e-134 608.5564 0 +17 20 101362 20 166727 2563 1620 148 677 0.059624683 0.66860753 0.32258084 0.1040584 2.4127984e-115 521.12161 0 +17 20 166727 20 101362 2563 1620 148 677 0.059624683 0.66860753 0.32258084 0.1040584 2.4127984e-115 521.12161 0 +17 20 101362 20 166824 2561 1622 148 677 0.059558902 0.66836298 0.32220364 
0.10381519 4.4417001e-115 519.90344 0 +17 20 166824 20 101362 2561 1622 148 677 0.059558902 0.66836298 0.32220364 0.10381519 4.4417001e-115 519.90344 0 +17 20 101362 20 167310 2563 1620 148 677 0.059624683 0.66860753 0.32258084 0.1040584 2.4127984e-115 521.12161 0 +17 20 167310 20 101362 2563 1620 148 677 0.059624683 0.66860753 0.32258084 0.1040584 2.4127984e-115 521.12161 0 +17 20 101362 20 168466 2940 1243 156 669 0.070691682 0.69413197 0.39226687 0.15387331 1.335062e-169 770.59447 0 +17 20 168466 20 101362 2940 1243 156 669 0.070691682 0.69413197 0.39226687 0.15387331 1.335062e-169 770.59447 0 +17 20 101362 20 169701 2560 1623 149 676 0.059359223 0.6661222 0.32112342 0.10312025 2.5394383e-114 516.42315 0 +17 20 169701 20 101362 2560 1623 149 676 0.059359223 0.6661222 0.32112342 0.10312025 2.5394383e-114 516.42315 0 +401 20 101437 20 143918 311 4650 45 2 -0.0083184801 0.95419043 -0.33573315 0.11271676 1.3608031e-50 564.32278 0 +401 20 143918 20 101437 311 4650 45 2 -0.0083184801 0.95419043 -0.33573315 0.11271676 1.3608031e-50 564.32278 0 +273 20 101437 20 148647 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +273 20 148647 20 101437 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +273 20 101437 20 153677 4869 92 2 45 0.0087288842 0.95624983 0.55498916 0.30801296 1.6763985e-71 1542.1256 0 +273 20 153677 20 101437 4869 92 2 45 0.0087288842 0.95624983 0.55498916 0.30801296 1.6763985e-71 1542.1256 0 +273 20 101437 20 153997 4957 4 2 45 0.0088937962 0.95702618 0.93710274 0.87816149 1.0917944e-102 4396.7248 0 +273 20 153997 20 101437 4957 4 2 45 0.0088937962 0.95702618 0.93710274 0.87816149 1.0917944e-102 4396.7248 0 +273 20 101437 20 154710 4600 361 1 46 0.0084225861 0.97684133 0.31968132 0.10219613 2.7447492e-50 511.65442 0 +273 20 154710 20 101437 4600 361 1 46 0.0084225861 0.97684133 0.31968132 0.10219613 2.7447492e-50 511.65442 0 +273 20 101437 20 154980 4939 22 2 45 0.0088600647 0.95686972 
0.79981065 0.63969707 1.3879015e-90 3202.7894 0 +273 20 154980 20 101437 4939 22 2 45 0.0088600647 0.95686972 0.79981065 0.63969707 1.3879015e-90 3202.7894 0 +273 20 101437 20 155263 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +273 20 155263 20 101437 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +273 20 101437 20 155378 4735 226 2 45 0.0084777698 0.95501244 0.38863364 0.1510361 2.4985554e-56 756.17891 0 +273 20 155378 20 101437 4735 226 2 45 0.0084777698 0.95501244 0.38863364 0.1510361 2.4985554e-56 756.17891 0 +273 20 101437 20 155627 4735 226 2 45 0.0084777698 0.95501244 0.38863364 0.1510361 2.4985554e-56 756.17891 0 +273 20 155627 20 101437 4735 226 2 45 0.0084777698 0.95501244 0.38863364 0.1510361 2.4985554e-56 756.17891 0 +273 20 101437 20 155684 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +273 20 155684 20 101437 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +273 20 101437 20 155880 4742 219 3 44 0.0082930811 0.93263233 0.38558045 0.1486723 1.867822e-54 744.33501 0 +273 20 155880 20 101437 4742 219 3 44 0.0082930811 0.93263233 0.38558045 0.1486723 1.867822e-54 744.33501 0 +273 20 101437 20 156051 4629 332 1 46 0.0084769325 0.97698641 0.33281079 0.11076303 7.456458e-52 554.54734 0 +273 20 156051 20 101437 4629 332 1 46 0.0084769325 0.97698641 0.33281079 0.11076303 7.456458e-52 554.54734 0 +273 20 101437 20 158264 4811 150 2 45 0.0086201932 0.95572269 0.46215418 0.21358648 1.8433382e-63 1069.3541 0 +273 20 158264 20 101437 4811 150 2 45 0.0086201932 0.95572269 0.46215418 0.21358648 1.8433382e-63 1069.3541 0 +273 20 101437 20 159941 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +273 20 159941 20 101437 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +273 20 101437 20 160331 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 
3.1301287e-78 2046.4836 0 +273 20 160331 20 101437 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +273 20 101437 20 162871 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +273 20 162871 20 101437 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +789 20 101437 20 167460 4961 0 21 26 0.0051429695 1.0000001 0.7421993 0.55085975 3.4601114e-57 2758.7053 0 +789 20 167460 20 101437 4961 0 21 26 0.0051429695 1.0000001 0.7421993 0.55085975 3.4601114e-57 2758.7053 0 +789 20 101437 20 167702 4961 0 21 26 0.0051429695 1.0000001 0.7421993 0.55085975 3.4601114e-57 2758.7053 0 +789 20 167702 20 101437 4961 0 21 26 0.0051429695 1.0000001 0.7421993 0.55085975 3.4601114e-57 2758.7053 0 +273 20 101437 20 167848 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +273 20 167848 20 101437 4902 59 2 45 0.0087907268 0.95654434 0.63933426 0.4087483 3.1301287e-78 2046.4836 0 +273 20 101437 20 169850 4900 61 2 45 0.0087869791 0.95652664 0.63313323 0.40085769 9.5110775e-78 2006.977 0 +273 20 169850 20 101437 4900 61 2 45 0.0087869791 0.95652664 0.63313323 0.40085769 9.5110775e-78 2006.977 0 +17 20 101439 20 140467 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 140467 20 101439 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 101439 20 141656 4878 70 1 59 0.011472538 0.98289263 0.6656369 0.4430725 1.8150342e-99 2218.5713 0 +17 20 141656 20 101439 4878 70 1 59 0.011472538 0.98289263 0.6656369 0.4430725 1.8150342e-99 2218.5713 0 +17 20 101439 20 144551 4767 181 1 59 0.011206989 0.98249441 0.48222905 0.23254485 4.7800704e-80 1164.3998 0 +17 20 144551 20 101439 4767 181 1 59 0.011206989 0.98249441 0.48222905 0.23254485 4.7800704e-80 1164.3998 0 +17 20 101439 20 146274 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 146274 20 101439 
4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 101439 20 147745 4912 36 1 59 0.011553879 0.98301119 0.77845013 0.60598457 1.2219956e-110 3034.3172 0 +17 20 147745 20 101439 4912 36 1 59 0.011553879 0.98301119 0.77845013 0.60598457 1.2219956e-110 3034.3172 0 +17 20 101439 20 153571 4938 10 1 59 0.011616079 0.98310053 0.91591185 0.83889449 2.0859391e-125 4200.5627 0 +17 20 153571 20 101439 4938 10 1 59 0.011616079 0.98310053 0.91591185 0.83889449 2.0859391e-125 4200.5627 0 +17 20 101439 20 153572 4938 10 1 59 0.011616079 0.98310053 0.91591185 0.83889449 2.0859391e-125 4200.5627 0 +17 20 153572 20 101439 4938 10 1 59 0.011616079 0.98310053 0.91591185 0.83889449 2.0859391e-125 4200.5627 0 +17 20 101439 20 156111 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 156111 20 101439 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 101439 20 156299 4854 94 3 57 0.011020546 0.94844556 0.59233683 0.35086295 4.647882e-88 1756.8332 0 +17 20 156299 20 101439 4854 94 3 57 0.011020546 0.94844556 0.59233683 0.35086295 4.647882e-88 1756.8332 0 +17 20 101439 20 156300 4854 94 3 57 0.011020546 0.94844556 0.59233683 0.35086295 4.647882e-88 1756.8332 0 +17 20 156300 20 101439 4854 94 3 57 0.011020546 0.94844556 0.59233683 0.35086295 4.647882e-88 1756.8332 0 +17 20 101439 20 156420 4940 8 14 46 0.0090561165 0.85005534 0.80594474 0.64954698 7.8135062e-91 3252.1412 0 +17 20 156420 20 101439 4940 8 14 46 0.0090561165 0.85005534 0.80594474 0.64954698 7.8135062e-91 3252.1412 0 +17 20 101439 20 156615 4939 9 6 54 0.01063203 0.89872593 0.87680072 0.76877952 5.9566489e-111 3849.3697 0 +17 20 156615 20 101439 4939 9 6 54 0.01063203 0.89872593 0.87680072 0.76877952 5.9566489e-111 3849.3697 0 +17 20 101439 20 156621 4939 9 6 54 0.01063203 0.89872593 0.87680072 0.76877952 5.9566489e-111 3849.3697 0 +17 20 156621 20 101439 4939 9 6 54 0.01063203 0.89872593 0.87680072 
0.76877952 5.9566489e-111 3849.3697 0 +17 20 101439 20 156755 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 156755 20 101439 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 101439 20 156769 4755 193 1 59 0.01117828 0.98245019 0.46999365 0.22089402 1.2720933e-78 1106.0608 0 +17 20 156769 20 101439 4755 193 1 59 0.01117828 0.98245019 0.46999365 0.22089402 1.2720933e-78 1106.0608 0 +17 20 101439 20 157523 4938 10 1 59 0.011616079 0.98310053 0.91591185 0.83889449 2.0859391e-125 4200.5627 0 +17 20 157523 20 101439 4938 10 1 59 0.011616079 0.98310053 0.91591185 0.83889449 2.0859391e-125 4200.5627 0 +17 20 101439 20 158264 4812 136 1 59 0.011314644 0.98265803 0.53759325 0.2890065 3.0547515e-86 1447.1194 0 +17 20 158264 20 101439 4812 136 1 59 0.011314644 0.98265803 0.53759325 0.2890065 3.0547515e-86 1447.1194 0 +17 20 101439 20 158498 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 158498 20 101439 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 101439 20 158746 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 158746 20 101439 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 101439 20 159819 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 159819 20 101439 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 101439 20 162684 4917 31 1 59 0.01156584 0.98302829 0.80020237 0.64032382 7.9589129e-113 3206.2637 0 +17 20 162684 20 101439 4917 31 1 59 0.01156584 0.98302829 0.80020237 0.64032382 7.9589129e-113 3206.2637 0 +17 20 101439 20 163483 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 163483 20 101439 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 101439 
20 164595 4917 31 1 59 0.01156584 0.98302829 0.80020237 0.64032382 7.9589129e-113 3206.2637 0 +17 20 164595 20 101439 4917 31 1 59 0.01156584 0.98302829 0.80020237 0.64032382 7.9589129e-113 3206.2637 0 +17 20 101439 20 165125 4913 35 1 59 0.011556271 0.98301452 0.78266221 0.61256015 4.631647e-111 3067.2431 0 +17 20 165125 20 101439 4913 35 1 59 0.011556271 0.98301452 0.78266221 0.61256015 4.631647e-111 3067.2431 0 +17 20 101439 20 165704 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 165704 20 101439 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 101439 20 167301 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 167301 20 101439 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +529 20 101439 20 168134 4947 1 51 9 0.0017732012 0.89878732 0.3650884 0.13328955 2.7000615e-17 663.28172 0 +529 20 168134 20 101439 4947 1 51 9 0.0017732012 0.89878732 0.3650884 0.13328955 2.7000615e-17 663.28172 0 +17 20 101439 20 170805 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 170805 20 101439 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +785 20 101899 20 143584 4978 16 2 12 0.0023805364 0.85633969 0.60467422 0.36563095 5.3660908e-27 1824.6379 0 +785 20 143584 20 101899 4978 16 2 12 0.0023805364 0.85633969 0.60467422 0.36563095 5.3660908e-27 1824.6379 0 +145 20 101918 20 143918 247 4615 109 37 -0.019692771 0.72718167 -0.45551923 0.20749778 1.8081547e-99 1039.0956 0 +145 20 143918 20 101918 247 4615 109 37 -0.019692771 0.72718167 -0.45551923 0.20749778 1.8081547e-99 1039.0956 0 +145 20 101918 20 145210 305 4557 108 38 -0.019161275 0.7163325 -0.41404861 0.17143627 8.9899562e-90 858.50621 0 +145 20 145210 20 101918 305 4557 108 38 -0.019161275 0.7163325 -0.41404861 0.17143627 8.9899562e-90 858.50621 0 +145 20 101918 20 148084 296 4566 108 38 
-0.019213665 0.716887 -0.41936928 0.1758706 6.128837e-91 880.71254 0 +145 20 148084 20 101918 296 4566 108 38 -0.019213665 0.716887 -0.41936928 0.1758706 6.128837e-91 880.71254 0 +17 20 101918 20 152087 4634 228 55 91 0.016313912 0.59765929 0.39707029 0.15766482 1.0214712e-173 789.52641 0 +17 20 152087 20 101918 4634 228 55 91 0.016313912 0.59765929 0.39707029 0.15766482 1.0214712e-173 789.52641 0 +17 20 101918 20 157234 4806 56 46 100 0.019059919 0.67480153 0.65214336 0.42529097 4.1559891e-131 2129.7405 0 +17 20 157234 20 101918 4806 56 46 100 0.019059919 0.67480153 0.65214336 0.42529097 4.1559891e-131 2129.7405 0 +17 20 101918 20 168546 4852 10 101 45 0.0086654481 0.81272203 0.49421996 0.24425337 1.1902079e-62 1222.896 0 +17 20 168546 20 101918 4852 10 101 45 0.0086654481 0.81272203 0.49421996 0.24425337 1.1902079e-62 1222.896 0 +209 20 102181 20 140171 1917 1568 1467 56 -0.08743614 0.88661212 -0.40603185 0.16486186 1.4474898e-181 825.626 0 +209 20 140171 20 102181 1917 1568 1467 56 -0.08743614 0.88661212 -0.40603185 0.16486186 1.4474898e-181 825.626 0 +81 20 102181 20 140280 1821 1664 158 1365 0.088626288 0.73747194 0.39406452 0.15528685 3.855726e-171 777.67453 0 +81 20 140280 20 102181 1821 1664 158 1365 0.088626288 0.73747194 0.39406452 0.15528685 3.855726e-171 777.67453 0 +209 20 102181 20 140505 1917 1568 1467 56 -0.08743614 0.88661212 -0.40603185 0.16486186 1.4474898e-181 825.626 0 +209 20 140505 20 102181 1917 1568 1467 56 -0.08743614 0.88661212 -0.40603185 0.16486186 1.4474898e-181 825.626 0 +209 20 102181 20 140506 1917 1568 1467 56 -0.08743614 0.88661212 -0.40603185 0.16486186 1.4474898e-181 825.626 0 +209 20 140506 20 102181 1917 1568 1467 56 -0.08743614 0.88661212 -0.40603185 0.16486186 1.4474898e-181 825.626 0 +209 20 102181 20 140695 1917 1568 1467 56 -0.08743614 0.88661212 -0.40603185 0.16486186 1.4474898e-181 825.626 0 +209 20 140695 20 102181 1917 1568 1467 56 -0.08743614 0.88661212 -0.40603185 0.16486186 1.4474898e-181 825.626 0 +209 20 102181 
20 140853 1917 1568 1467 56 -0.08743614 0.88661212 -0.40603185 0.16486186 1.4474898e-181 825.626 0 +209 20 140853 20 102181 1917 1568 1467 56 -0.08743614 0.88661212 -0.40603185 0.16486186 1.4474898e-181 825.626 0 +209 20 102181 20 140953 1921 1564 1467 56 -0.087193236 0.88633221 -0.40516403 0.1641579 8.4541373e-181 822.10062 0 +209 20 140953 20 102181 1921 1564 1467 56 -0.087193236 0.88633221 -0.40516403 0.1641579 8.4541373e-181 822.10062 0 +209 20 102181 20 141025 1947 1538 1466 57 -0.085475422 0.88248897 -0.39881369 0.15905236 3.062415e-175 796.53202 0 +209 20 141025 20 102181 1947 1538 1466 57 -0.085475422 0.88248897 -0.39881369 0.15905236 3.062415e-175 796.53202 0 +209 20 102181 20 141029 1933 1552 1465 58 -0.086186625 0.88154143 -0.40113685 0.16091077 2.9012717e-177 805.83888 0 +209 20 141029 20 102181 1933 1552 1465 58 -0.086186625 0.88154143 -0.40113685 0.16091077 2.9012717e-177 805.83888 0 +209 20 102181 20 141049 1945 1540 1466 57 -0.085596867 0.88263613 -0.3992472 0.15939832 1.2863727e-175 798.26461 0 +209 20 141049 20 102181 1945 1540 1466 57 -0.085596867 0.88263613 -0.3992472 0.15939832 1.2863727e-175 798.26461 0 +209 20 102181 20 141054 1945 1540 1466 57 -0.085596867 0.88263613 -0.3992472 0.15939832 1.2863727e-175 798.26461 0 +209 20 141054 20 102181 1945 1540 1466 57 -0.085596867 0.88263613 -0.3992472 0.15939832 1.2863727e-175 798.26461 0 +209 20 102181 20 141074 1942 1543 1466 57 -0.085779041 0.88285613 -0.39989752 0.15991801 3.4952307e-176 800.86739 0 +209 20 141074 20 102181 1942 1543 1466 57 -0.085779041 0.88285613 -0.39989752 0.15991801 3.4952307e-176 800.86739 0 +209 20 102181 20 141514 2177 1308 1470 53 -0.072064362 0.8719492 -0.3521283 0.12399432 4.6203588e-137 620.96131 0 +209 20 141514 20 102181 2177 1308 1470 53 -0.072064362 0.8719492 -0.3521283 0.12399432 4.6203588e-137 620.96131 0 +209 20 102181 20 141769 1917 1568 1467 56 -0.08743614 0.88661212 -0.40603185 0.16486186 1.4474898e-181 825.626 0 +209 20 141769 20 102181 1917 1568 1467 56 
-0.08743614 0.88661212 -0.40603185 0.16486186 1.4474898e-181 825.626 0 +81 20 102181 20 143447 1820 1665 160 1363 0.088287652 0.73428303 0.39252445 0.15407546 8.0379235e-170 771.60795 0 +81 20 143447 20 102181 1820 1665 160 1363 0.088287652 0.73428303 0.39252445 0.15407546 8.0379235e-170 771.60795 0 +209 20 102181 20 143495 1917 1568 1466 57 -0.087297179 0.88465834 -0.40532166 0.16428566 6.136812e-181 822.74055 0 +209 20 143495 20 102181 1917 1568 1466 57 -0.087297179 0.88465834 -0.40532166 0.16428566 6.136812e-181 822.74055 0 +209 20 102181 20 143573 1914 1571 1400 123 -0.078308336 0.76124305 -0.35979271 0.1294508 5.2671875e-143 648.28735 0 +209 20 143573 20 102181 1914 1571 1400 123 -0.078308336 0.76124305 -0.35979271 0.1294508 5.2671875e-143 648.28735 0 +209 20 102181 20 143723 1904 1581 1466 57 -0.08808662 0.88557386 -0.40814582 0.16658302 1.9349428e-183 834.24549 0 +209 20 143723 20 102181 1904 1581 1466 57 -0.08808662 0.88557386 -0.40814582 0.16658302 1.9349428e-183 834.24549 0 +209 20 102181 20 143754 1905 1580 1441 82 -0.084552012 0.83776397 -0.39032176 0.15235108 6.0648223e-168 762.97216 0 +209 20 143754 20 102181 1905 1580 1441 82 -0.084552012 0.83776397 -0.39032176 0.15235108 6.0648223e-168 762.97216 0 +81 20 102181 20 145413 1938 1547 168 1355 0.094341621 0.73768967 0.41543469 0.17258598 5.6345483e-190 864.30868 0 +81 20 145413 20 102181 1938 1547 168 1355 0.094341621 0.73768967 0.41543469 0.17258598 5.6345483e-190 864.30868 0 +209 20 102181 20 145853 1881 1604 1463 60 -0.089066438 0.88143349 -0.41103765 0.16895196 5.098446e-186 846.1092 0 +209 20 145853 20 102181 1881 1604 1463 60 -0.089066438 0.88143349 -0.41103765 0.16895196 5.098446e-186 846.1092 0 +209 20 102181 20 146951 1844 1641 1463 60 -0.091313288 0.88401246 -0.4191235 0.1756645 2.5076159e-193 879.72572 0 +209 20 146951 20 102181 1844 1641 1463 60 -0.091313288 0.88401246 -0.4191235 0.1756645 2.5076159e-193 879.72572 0 +209 20 102181 20 147497 1845 1640 1463 60 -0.09125255 0.88394421 
-0.41890457 0.17548105 3.9716083e-193 878.80709 0 +209 20 147497 20 102181 1845 1640 1463 60 -0.09125255 0.88394421 -0.41890457 0.17548105 3.9716083e-193 878.80709 0 +209 20 102181 20 147915 1844 1641 1462 61 -0.091174334 0.88214868 -0.41842604 0.17508034 1.0845794e-192 876.80015 0 +209 20 147915 20 102181 1844 1641 1462 61 -0.091174334 0.88214868 -0.41842604 0.17508034 1.0845794e-192 876.80015 0 +209 20 102181 20 147917 1844 1641 1462 61 -0.091174334 0.88214868 -0.41842604 0.17508034 1.0845794e-192 876.80015 0 +209 20 147917 20 102181 1844 1641 1462 61 -0.091174334 0.88214868 -0.41842604 0.17508034 1.0845794e-192 876.80015 0 +81 20 102181 20 149407 1901 1584 179 1344 0.090566278 0.71702105 0.39950904 0.1596075 7.6135734e-176 799.31226 0 +81 20 149407 20 102181 1901 1584 179 1344 0.090566278 0.71702105 0.39950904 0.1596075 7.6135734e-176 799.31226 0 +209 20 102181 20 149975 1808 1677 199 1324 0.082139701 0.67396057 0.36435416 0.13275397 1.3303447e-146 664.82986 0 +209 20 149975 20 102181 1808 1677 199 1324 0.082139701 0.67396057 0.36435416 0.13275397 1.3303447e-146 664.82986 0 +81 20 102181 20 151330 1902 1583 171 1352 0.091738641 0.7287553 0.40487963 0.16392751 1.5062334e-180 820.94694 0 +81 20 151330 20 102181 1902 1583 171 1352 0.091738641 0.7287553 0.40487963 0.16392751 1.5062334e-180 820.94694 0 +209 20 102181 20 152570 1800 1685 99 1424 0.095549397 0.82857484 0.42808738 0.1832588 1.3535802e-201 917.75801 0 +209 20 152570 20 102181 1800 1685 99 1424 0.095549397 0.82857484 0.42808738 0.1832588 1.3535802e-201 917.75801 0 +81 20 102181 20 152972 1871 1614 172 1351 0.089717194 0.72316277 0.39683181 0.15747549 1.5958695e-173 788.63519 0 +81 20 152972 20 102181 1871 1614 172 1351 0.089717194 0.72316277 0.39683181 0.15747549 1.5958695e-173 788.63519 0 +81 20 102181 20 152992 1871 1614 173 1350 0.089578249 0.72168952 0.3961871 0.15696421 5.7505951e-173 786.07466 0 +81 20 152992 20 102181 1871 1614 173 1350 0.089578249 0.72168952 0.3961871 0.15696421 5.7505951e-173 
786.07466 0 +81 20 102181 20 153393 2081 1404 341 1182 0.07898616 0.53703874 0.34357929 0.11804673 1.3902213e-130 591.17613 0 +81 20 153393 20 102181 2081 1404 341 1182 0.07898616 0.53703874 0.34357929 0.11804673 1.3902213e-130 591.17613 0 +81 20 102181 20 155910 1818 1667 120 1403 0.0937244 0.79639339 0.41829619 0.1749717 1.4239651e-192 876.25627 0 +81 20 155910 20 102181 1818 1667 120 1403 0.0937244 0.79639339 0.41829619 0.1749717 1.4239651e-192 876.25627 0 +81 20 102181 20 156329 1825 1660 140 1383 0.091370381 0.76572293 0.40677148 0.16546305 3.206492e-182 828.63685 0 +81 20 156329 20 102181 1825 1660 140 1383 0.091370381 0.76572293 0.40677148 0.16546305 3.206492e-182 828.63685 0 +81 20 102181 20 156330 1825 1660 140 1383 0.091370381 0.76572293 0.40677148 0.16546305 3.206492e-182 828.63685 0 +81 20 156330 20 102181 1825 1660 140 1383 0.091370381 0.76572293 0.40677148 0.16546305 3.206492e-182 828.63685 0 +81 20 102181 20 156534 1823 1662 123 1400 0.093611166 0.79216123 0.41747546 0.17428577 7.9488226e-192 872.82103 0 +81 20 156534 20 102181 1823 1662 123 1400 0.093611166 0.79216123 0.41747546 0.17428577 7.9488226e-192 872.82103 0 +81 20 102181 20 156538 1823 1662 123 1400 0.093611166 0.79216123 0.41747546 0.17428577 7.9488226e-192 872.82103 0 +81 20 156538 20 102181 1823 1662 123 1400 0.093611166 0.79216123 0.41747546 0.17428577 7.9488226e-192 872.82103 0 +81 20 102181 20 157157 1819 1666 124 1399 0.093229301 0.79014784 0.41588962 0.17296419 2.1832009e-190 866.20275 0 +81 20 157157 20 102181 1819 1666 124 1399 0.093229301 0.79014784 0.41588962 0.17296419 2.1832009e-190 866.20275 0 +81 20 102181 20 157746 1820 1665 121 1402 0.093706891 0.7950139 0.41809905 0.17480682 2.1527175e-192 875.43064 0 +81 20 157746 20 102181 1820 1665 121 1402 0.093706891 0.7950139 0.41809905 0.17480682 2.1527175e-192 875.43064 0 +81 20 102181 20 157976 1819 1666 123 1400 0.093368262 0.79173309 0.41654879 0.17351289 5.5171297e-191 868.95063 0 +81 20 157976 20 102181 1819 1666 123 1400 
0.093368262 0.79173309 0.41654879 0.17351289 5.5171297e-191 868.95063 0 +209 20 102181 20 158114 1851 1634 123 1400 0.095311485 0.79510927 0.42397591 0.17975558 8.8175349e-198 900.21378 0 +209 20 158114 20 102181 1851 1634 123 1400 0.095311485 0.79510927 0.42397591 0.17975558 8.8175349e-198 900.21378 0 +81 20 102181 20 158495 1820 1665 123 1400 0.093428984 0.79184026 0.41678038 0.17370591 3.4007637e-191 869.91723 0 +81 20 158495 20 102181 1820 1665 123 1400 0.093428984 0.79184026 0.41678038 0.17370591 3.4007637e-191 869.91723 0 +209 20 102181 20 160208 1717 1768 81 1442 0.093010366 0.85186422 0.42146477 0.17763256 1.8055523e-195 889.58188 0 +209 20 160208 20 102181 1717 1768 81 1442 0.093010366 0.85186422 0.42146477 0.17763256 1.8055523e-195 889.58188 0 +81 20 102181 20 160551 1822 1663 129 1394 0.092716709 0.78258133 0.41329387 0.17081182 4.8136344e-188 855.42358 0 +81 20 160551 20 102181 1822 1663 129 1394 0.092716709 0.78258133 0.41329387 0.17081182 4.8136344e-188 855.42358 0 +81 20 102181 20 161450 1822 1663 130 1393 0.092577755 0.78100818 0.41263625 0.17026865 1.8785813e-187 852.70346 0 +81 20 161450 20 102181 1822 1663 130 1393 0.092577755 0.78100818 0.41263625 0.17026865 1.8785813e-187 852.70346 0 +81 20 102181 20 165908 1823 1662 129 1394 0.092777431 0.78269267 0.41352624 0.17100395 2.973598e-188 856.38582 0 +81 20 165908 20 102181 1823 1662 129 1394 0.092777431 0.78269267 0.41352624 0.17100395 2.973598e-188 856.38582 0 +81 20 102181 20 167431 1823 1662 129 1394 0.092777431 0.78269267 0.41352624 0.17100395 2.973598e-188 856.38582 0 +81 20 167431 20 102181 1823 1662 129 1394 0.092777431 0.78269267 0.41352624 0.17100395 2.973598e-188 856.38582 0 +81 20 102181 20 168728 1822 1663 129 1394 0.092716709 0.78258133 0.41329387 0.17081182 4.8136344e-188 855.42358 0 +81 20 168728 20 102181 1822 1663 129 1394 0.092716709 0.78258133 0.41329387 0.17081182 4.8136344e-188 855.42358 0 +81 20 102181 20 169995 1819 1666 129 1394 0.092534527 0.78224641 0.41259688 0.17023617 
2.0378944e-187 852.54085 0 +81 20 169995 20 102181 1819 1666 129 1394 0.092534527 0.78224641 0.41259688 0.17023617 2.0378944e-187 852.54085 0 +81 20 102181 20 170589 1819 1666 129 1394 0.092534527 0.78224641 0.41259688 0.17023617 2.0378944e-187 852.54085 0 +81 20 170589 20 102181 1819 1666 129 1394 0.092534527 0.78224641 0.41259688 0.17023617 2.0378944e-187 852.54085 0 +81 20 102181 20 170642 1864 1621 235 1288 0.08053796 0.63185424 0.35481328 0.12589245 3.9553147e-139 630.46733 0 +81 20 170642 20 102181 1864 1621 235 1288 0.08053796 0.63185424 0.35481328 0.12589245 3.9553147e-139 630.46733 0 +17 20 102205 20 141092 4811 109 3 85 0.016292144 0.96453524 0.64258271 0.41291258 6.5375641e-125 2067.711 0 +17 20 141092 20 102205 4811 109 3 85 0.016292144 0.96453524 0.64258271 0.41291258 6.5375641e-125 2067.711 0 +17 20 102205 20 149381 4891 29 6 82 0.015984328 0.93027258 0.82636631 0.68288118 8.6204478e-146 3419.603 0 +17 20 149381 20 102205 4891 29 6 82 0.015984328 0.93027258 0.82636631 0.68288118 8.6204478e-146 3419.603 0 +17 20 102205 20 149599 4892 28 6 82 0.015987838 0.93028694 0.83021152 0.68925124 2.2549178e-146 3451.5007 0 +17 20 149599 20 102205 4892 28 6 82 0.015987838 0.93028694 0.83021152 0.68925124 2.2549178e-146 3451.5007 0 +17 20 102205 20 151677 4892 28 6 82 0.015987838 0.93028694 0.83021152 0.68925124 2.2549178e-146 3451.5007 0 +17 20 151677 20 102205 4892 28 6 82 0.015987838 0.93028694 0.83021152 0.68925124 2.2549178e-146 3451.5007 0 +17 20 102205 20 154403 4740 180 7 81 0.015258335 0.91608101 0.52249485 0.27300087 9.940814e-101 1367.0712 0 +17 20 154403 20 102205 4740 180 7 81 0.015258335 0.91608101 0.52249485 0.27300087 9.940814e-101 1367.0712 0 +17 20 102205 20 154809 4740 180 7 81 0.015258335 0.91608101 0.52249485 0.27300087 9.940814e-101 1367.0712 0 +17 20 154809 20 102205 4740 180 7 81 0.015258335 0.91608101 0.52249485 0.27300087 9.940814e-101 1367.0712 0 +145 20 102205 20 155449 4596 324 7 81 0.014753073 0.91345572 0.41185018 0.16962056 
4.8008034e-83 849.38188 0 +145 20 155449 20 102205 4596 324 7 81 0.014753073 0.91345572 0.41185018 0.16962056 4.8008034e-83 849.38188 0 +17 20 102205 20 155732 4755 165 7 81 0.015310965 0.91634524 0.53919399 0.29073015 3.1914887e-103 1455.8531 0 +17 20 155732 20 102205 4755 165 7 81 0.015310965 0.91634524 0.53919399 0.29073015 3.1914887e-103 1455.8531 0 +17 20 102205 20 156532 4887 33 18 70 0.013616233 0.79115933 0.73016936 0.53314728 4.1953831e-114 2669.7147 0 +17 20 156532 20 102205 4887 33 18 70 0.013616233 0.79115933 0.73016936 0.53314728 4.1953831e-114 2669.7147 0 +17 20 102205 20 158744 4756 164 8 80 0.015118303 0.9044348 0.53447449 0.28566298 4.2225621e-101 1430.4756 0 +17 20 158744 20 102205 4756 164 8 80 0.015118303 0.9044348 0.53447449 0.28566298 4.2225621e-101 1430.4756 0 +17 20 102205 20 160207 4754 166 8 80 0.015111285 0.90439463 0.53216207 0.28319645 9.2607029e-101 1418.124 0 +17 20 160207 20 102205 4754 166 8 80 0.015111285 0.90439463 0.53216207 0.28319645 9.2607029e-101 1418.124 0 +17 20 102205 20 162810 4891 29 9 79 0.015395814 0.89547306 0.80667287 0.65072119 8.0311263e-138 3258.5379 0 +17 20 162810 20 102205 4891 29 9 79 0.015395814 0.89547306 0.80667287 0.65072119 8.0311263e-138 3258.5379 0 +17 20 102205 20 167599 4916 4 25 63 0.012344785 0.93923068 0.81779313 0.66878563 1.1083857e-118 3348.8402 0 +17 20 167599 20 102205 4916 4 25 63 0.012344785 0.93923068 0.81779313 0.66878563 1.1083857e-118 3348.8402 0 +17 20 102205 20 167643 4891 29 8 80 0.015591985 0.90706813 0.81327826 0.6614216 2.0096625e-140 3312.1281 0 +17 20 167643 20 102205 4891 29 8 80 0.015591985 0.90706813 0.81327826 0.6614216 2.0096625e-140 3312.1281 0 +17 20 102267 20 140467 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 140467 20 102267 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 102267 20 141656 4878 70 1 59 0.011472538 0.98289263 0.6656369 0.4430725 1.8150342e-99 2218.5713 0 +17 20 141656 
20 102267 4878 70 1 59 0.011472538 0.98289263 0.6656369 0.4430725 1.8150342e-99 2218.5713 0 +17 20 102267 20 144551 4767 181 1 59 0.011206989 0.98249441 0.48222905 0.23254485 4.7800704e-80 1164.3998 0 +17 20 144551 20 102267 4767 181 1 59 0.011206989 0.98249441 0.48222905 0.23254485 4.7800704e-80 1164.3998 0 +17 20 102267 20 146274 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 146274 20 102267 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 102267 20 147745 4912 36 1 59 0.011553879 0.98301119 0.77845013 0.60598457 1.2219956e-110 3034.3172 0 +17 20 147745 20 102267 4912 36 1 59 0.011553879 0.98301119 0.77845013 0.60598457 1.2219956e-110 3034.3172 0 +17 20 102267 20 153571 4938 10 1 59 0.011616079 0.98310053 0.91591185 0.83889449 2.0859391e-125 4200.5627 0 +17 20 153571 20 102267 4938 10 1 59 0.011616079 0.98310053 0.91591185 0.83889449 2.0859391e-125 4200.5627 0 +17 20 102267 20 153572 4938 10 1 59 0.011616079 0.98310053 0.91591185 0.83889449 2.0859391e-125 4200.5627 0 +17 20 153572 20 102267 4938 10 1 59 0.011616079 0.98310053 0.91591185 0.83889449 2.0859391e-125 4200.5627 0 +17 20 102267 20 156111 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 156111 20 102267 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 102267 20 156299 4854 94 3 57 0.011020546 0.94844556 0.59233683 0.35086295 4.647882e-88 1756.8332 0 +17 20 156299 20 102267 4854 94 3 57 0.011020546 0.94844556 0.59233683 0.35086295 4.647882e-88 1756.8332 0 +17 20 102267 20 156300 4854 94 3 57 0.011020546 0.94844556 0.59233683 0.35086295 4.647882e-88 1756.8332 0 +17 20 156300 20 102267 4854 94 3 57 0.011020546 0.94844556 0.59233683 0.35086295 4.647882e-88 1756.8332 0 +17 20 102267 20 156420 4940 8 14 46 0.0090561165 0.85005534 0.80594474 0.64954698 7.8135062e-91 3252.1412 0 +17 20 156420 20 102267 4940 8 14 46 0.0090561165 0.85005534 
0.80594474 0.64954698 7.8135062e-91 3252.1412 0 +17 20 102267 20 156615 4939 9 6 54 0.01063203 0.89872593 0.87680072 0.76877952 5.9566489e-111 3849.3697 0 +17 20 156615 20 102267 4939 9 6 54 0.01063203 0.89872593 0.87680072 0.76877952 5.9566489e-111 3849.3697 0 +17 20 102267 20 156621 4939 9 6 54 0.01063203 0.89872593 0.87680072 0.76877952 5.9566489e-111 3849.3697 0 +17 20 156621 20 102267 4939 9 6 54 0.01063203 0.89872593 0.87680072 0.76877952 5.9566489e-111 3849.3697 0 +17 20 102267 20 156755 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 156755 20 102267 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 102267 20 156769 4755 193 1 59 0.01117828 0.98245019 0.46999365 0.22089402 1.2720933e-78 1106.0608 0 +17 20 156769 20 102267 4755 193 1 59 0.01117828 0.98245019 0.46999365 0.22089402 1.2720933e-78 1106.0608 0 +17 20 102267 20 157523 4938 10 1 59 0.011616079 0.98310053 0.91591185 0.83889449 2.0859391e-125 4200.5627 0 +17 20 157523 20 102267 4938 10 1 59 0.011616079 0.98310053 0.91591185 0.83889449 2.0859391e-125 4200.5627 0 +17 20 102267 20 158264 4812 136 1 59 0.011314644 0.98265803 0.53759325 0.2890065 3.0547515e-86 1447.1194 0 +17 20 158264 20 102267 4812 136 1 59 0.011314644 0.98265803 0.53759325 0.2890065 3.0547515e-86 1447.1194 0 +17 20 102267 20 158498 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 158498 20 102267 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 102267 20 158746 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 158746 20 102267 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 102267 20 159819 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 159819 20 102267 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 
102267 20 162684 4917 31 1 59 0.01156584 0.98302829 0.80020237 0.64032382 7.9589129e-113 3206.2637 0 +17 20 162684 20 102267 4917 31 1 59 0.01156584 0.98302829 0.80020237 0.64032382 7.9589129e-113 3206.2637 0 +17 20 102267 20 163483 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 163483 20 102267 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 +17 20 102267 20 164595 4917 31 1 59 0.01156584 0.98302829 0.80020237 0.64032382 7.9589129e-113 3206.2637 0 +17 20 164595 20 102267 4917 31 1 59 0.01156584 0.98302829 0.80020237 0.64032382 7.9589129e-113 3206.2637 0 +17 20 102267 20 165125 4913 35 1 59 0.011556271 0.98301452 0.78266221 0.61256015 4.631647e-111 3067.2431 0 +17 20 165125 20 102267 4913 35 1 59 0.011556271 0.98301452 0.78266221 0.61256015 4.631647e-111 3067.2431 0 +17 20 102267 20 165704 4939 9 1 59 0.011618471 0.98310393 0.92271847 0.85140938 3.0236979e-126 4263.2286 0 diff --git a/test_files/view/1kgp3_chr20_subset_20k.twk b/test_files/view/1kgp3_chr20_subset_20k.twk deleted file mode 100644 index 3545066..0000000 Binary files a/test_files/view/1kgp3_chr20_subset_20k.twk and /dev/null differ diff --git a/test_files/view/1kgp3_chr20_subset_20k.twk.twi b/test_files/view/1kgp3_chr20_subset_20k.twk.twi deleted file mode 100644 index 572a7d0..0000000 Binary files a/test_files/view/1kgp3_chr20_subset_20k.twk.twi and /dev/null differ diff --git a/test_files/view/1kgp3_chr20_subset_20k.two b/test_files/view/1kgp3_chr20_subset_20k.two deleted file mode 100644 index 978caf6..0000000 Binary files a/test_files/view/1kgp3_chr20_subset_20k.two and /dev/null differ diff --git a/test_files/view/1kgp3_chr20_subset_20k.two.toi b/test_files/view/1kgp3_chr20_subset_20k.two.toi deleted file mode 100644 index 78ce22d..0000000 Binary files a/test_files/view/1kgp3_chr20_subset_20k.two.toi and /dev/null differ