diff --git a/.gitmodules b/.gitmodules index 3babdbb..d5338be 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ [submodule "lib/third_party/xxhash"] path = lib/third_party/xxhash url = https://github.com/Cyan4973/xxHash -[submodule "lib/third_party/zlib"] - path = lib/third_party/zlib - url = https://github.com/cloudflare/zlib diff --git a/.settings/language.settings.xml b/.settings/language.settings.xml index 202222a..28465a1 100644 --- a/.settings/language.settings.xml +++ b/.settings/language.settings.xml @@ -1,52 +1,27 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.travis.yml b/.travis.yml index 036f928..f999aa0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,8 +11,16 @@ matrix: - ubuntu-toolchain-r-test packages: - g++-4.9 + - pkg-config + - libncurses5-dev + - libncursesw5-dev + - zlib1g-dev - libssl-dev - liblz4-dev + - libcurl4-openssl-dev + - liblz-dev + - libbz2-dev + - liblzma-dev env: - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" compiler: gcc @@ -25,8 +33,16 @@ matrix: - ubuntu-toolchain-r-test packages: - g++-5 + - pkg-config + - libncurses5-dev + - libncursesw5-dev + - zlib1g-dev - libssl-dev - liblz4-dev + - libcurl4-openssl-dev + - liblz-dev + - libbz2-dev + - liblzma-dev env: - MATRIX_EVAL="CC=gcc-5 && CXX=g++-5" compiler: gcc @@ -39,8 +55,16 @@ matrix: - ubuntu-toolchain-r-test packages: - g++-6 + - pkg-config + - libncurses5-dev + - libncursesw5-dev + - zlib1g-dev - libssl-dev - liblz4-dev + - libcurl4-openssl-dev + - liblz-dev + - libbz2-dev + - liblzma-dev env: - MATRIX_EVAL="CC=gcc-6 && CXX=g++-6" compiler: gcc @@ -54,8 +78,16 @@ matrix: - llvm-toolchain-precise-3.6 packages: - clang-3.6 + - pkg-config + - libncurses5-dev + - libncursesw5-dev + - zlib1g-dev - libssl-dev - liblz4-dev + - libcurl4-openssl-dev + - liblz-dev + - libbz2-dev + - liblzma-dev env: - MATRIX_EVAL="CC=clang-3.6 && CXX=clang++-3.6" compiler: clang @@ 
-69,8 +101,16 @@ matrix: - llvm-toolchain-precise-3.7 packages: - clang-3.7 + - pkg-config + - libncurses5-dev + - libncursesw5-dev + - zlib1g-dev - libssl-dev - liblz4-dev + - libcurl4-openssl-dev + - liblz-dev + - libbz2-dev + - liblzma-dev env: - MATRIX_EVAL="CC=clang-3.7 && CXX=clang++-3.7" compiler: clang @@ -84,8 +124,16 @@ matrix: - llvm-toolchain-precise-3.8 packages: - clang-3.8 + - pkg-config + - libncurses5-dev + - libncursesw5-dev + - zlib1g-dev - libssl-dev - liblz4-dev + - libcurl4-openssl-dev + - liblz-dev + - libbz2-dev + - liblzma-dev env: - MATRIX_EVAL="CC=clang-3.8 && CXX=clang++-3.8" compiler: clang @@ -98,8 +146,16 @@ matrix: - llvm-toolchain-trusty-3.9 packages: - clang-3.9 + - pkg-config + - libncurses5-dev + - libncursesw5-dev + - zlib1g-dev - libssl-dev - liblz4-dev + - libcurl4-openssl-dev + - liblz-dev + - libbz2-dev + - liblzma-dev env: - MATRIX_EVAL="CC=clang-3.9 && CXX=clang++-3.9" compiler: clang @@ -112,8 +168,16 @@ matrix: - llvm-toolchain-trusty-4.0 packages: - clang-4.0 + - pkg-config + - libncurses5-dev + - libncursesw5-dev + - zlib1g-dev - libssl-dev - liblz4-dev + - libcurl4-openssl-dev + - liblz-dev + - libbz2-dev + - liblzma-dev env: - MATRIX_EVAL="CC=clang-4.0 && CXX=clang++-4.0" compiler: clang @@ -126,23 +190,28 @@ matrix: - llvm-toolchain-trusty-5.0 packages: - clang-5.0 + - pkg-config + - libncurses5-dev + - libncursesw5-dev + - zlib1g-dev - libssl-dev - liblz4-dev + - libcurl4-openssl-dev + - liblz-dev + - libbz2-dev + - liblzma-dev env: - MATRIX_EVAL="CC=clang-5.0 && CXX=clang++-5.0" compiler: clang before_install: - eval "${MATRIX_EVAL}" - - git clone --recursive https://github.com/facebook/zstd - - cd zstd - - make -j4 - - sudo make install - - cd .. + - git clone https://github.com/facebook/zstd + - cd zstd && make -j4 && sudo make install && cd .. + - git clone https://github.com/samtools/htslib + - cd htslib && autoheader && autoconf && ./configure && make -j 4 && sudo make install && cd .. 
+ script: - git submodule update --recursive - - cd lib/third_party/zlib - - ./configure - - cd ../../../ - make -j 4 \ No newline at end of file diff --git a/README.md b/README.md index 2d038d2..2af382e 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,12 @@ [![Build Status](https://travis-ci.org/mklarqvist/tachyon.svg?branch=master)](https://travis-ci.org/mklarqvist/tachyon) -[![Release](https://img.shields.io/badge/Release-beta_0.1-blue.svg)](https://github.com/mklarqvist/Tachyon/releases) +[![Release](https://img.shields.io/badge/Release-beta_0.3.0-blue.svg)](https://github.com/mklarqvist/Tachyon/releases) [![License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)


-# Exploring population-scale sequence variant data -Tachyon, or `YON` for short, is an open source software library for storing and rapidly querying sequence variant data in an (optionally) lossless and bit-exact representation. It is completely compatible with BCF/VCF. It was developed with a focus on enabling fast experimentation and storage of population-scaled datasets. We have benchmarked Tachyon on population-scaled datasets up to 10 million whole-genome sequenced individuals (see [benchmarks](docs/benchmarks.md)). Tachyon grew out of the [Tomahawk][tomahawk] project for calculating genome-wide linkage-disequilibrium. +Tachyon, or `YON` for short, is an open source C++ software library for reading, writing, and manipulating sequence variant data in a lossless and bit-exact representation. It is completely compatible with BCF/VCF. It was developed with a focus on enabling fast experimentation and storage of population-scaled datasets. ## Highlights of Tachyon * **Self-indexing**: Tachyon always builds the best possible quad-tree, linear, and meta-index given the input data (irrespective of sorting). There are no external indices as data are stored in the file itself. @@ -15,334 +14,46 @@ Tachyon, or `YON` for short, is an open source software library for storing and * **Encryption**: Natively supports block-wise, field-wise, and entry-wise encryption with all commonly used encryption models and paradigms through [openssl][openssl]. * **Compression**: Tachyon files are generally many fold (in many cases many 10- to 100-folds) smaller than the current standard file-format. * **Field-specific layout**: In principle, Tachyon is implemented as a standard column-oriented management system with several layers of domain-specific heuristics providing fast and flexible data queries. This memory layout enables extremely rapid field-specific queries. +* **Performance**: The file-format is designed as independent blocks of data into independent byte streams. 
This approach is inherently amenable to parallelization through scatter-gather approaches on multiple cores or multiple machines. * **High-level API**: User-friendly C++/C API for quering, manipulating, and exploring sequence data with minimal programming experience * **Comaptibility**: We strive to provide API calls to return YON data streams to any of the current standard file-formats (`VCF` and `BCF`). This allows for immediate use of Tachyon without disrupting the existing ecosystem of tools. ---- +--- -## Project status -Tachyon is under active development and the specification and/or the API interfaces may change at any time! -**Commits may break functionality! THERE IS NO STABILITY PROMISE WHATSOEVER!** - -Current limitations imposed during development: -* Importing is restricted to `BCF` -* Output is restricted to `VCF`, `JSON`, and custom field slicing - ---- - -## Table of contents -- [Getting started](#getting-started) - - [Dependencies](#dependencies) - - [Building from source](#building-from-source) -- [Workflow example: using the CLI](#workflow-example-using-the-cli) - - [`import`: Importing `VCF`/`BCF`](#import-importing-vcfbcf) - - [`view`: Viewing, converting, and slicing `YON` files](#view-viewing-converting-and-slicing-yon-files) - - [Field-slicing](#field-slicing) - - [Searching for genomic regions](#searching-for-genomic-regions) - - [Annotating meta-data](#annotating-meta-data) -- [C++ API Examples](#c-api-examples) - - [Standard containers](#standard-containers) - - [Genotype containers / objects](#genotype-containers--objects) - - [Math objects](#math-objects) -- [Author](#author) -- [Acknowledgements](#acknowledgements) -- [License](#license) ---- - -## Getting started -### Dependencies -You will need to have installed the following dependencies: -* [zstd][zstd]: A compression library developed at Facebook -* [openssl][openssl]: An open-source library for encryption/decryption - -### Building from source -If the required external 
dependencies listed above are installed then building is trivial. Note the added `--recursive` flag to the clone request. This flag is required to additionally pull down the latest third-party dependencies. -```bash -git clone --recursive https://github.com/mklarqvist/tachyon -cd tachyon -make -``` -Tachyon comes bundled with several API-examples in the `lib_example` directory. They are built by default but should you want to rebuild them execute the command: -```bash -make examples -``` - -### Building without admin privilidges -If you have no super-user (`sudo`) powers required to install software on your machine: - -### Linux/MacOSX -```bash -git clone --recursive https://github.com/mklarqvist/tachyon -cd tachyon -# If you do NOT have ZSTD available -git clone https://github.com/facebook/zstd -cd zstd -make -cd .. -# If you do NOT have OpenSSL installed -git clone git://git.openssl.org/openssl.git -cd openssl -./config -make -cd .. -# Build Tachyon -make -``` -### MacOSX -Installation using [Homebrew](https://brew.sh/): +## Installation +For Ubuntu, Debian, and Mac systems, installation is easy: just run ```bash -brew update -# If you do NOT have OpenSSL installed -brew install openssl -# If you do NOT have ZSTD installed -brew install zstd -# Install Tachyon git clone --recursive https://github.com/mklarqvist/tachyon cd tachyon -make -``` - -## Workflow example: using the CLI -### `import`: Importing `VCF`/`BCF` -Import a `bcf` file to `yon` with a block-size of `-c` number of variants and/or `-C` number of base-pairs. If both `-c` and `-C` are set then the block breaks whenever either condition is satisfied. **Please note that importing VCF files are currently disabled** -```bash -tachyon import -i examples/example_dataset.bcf -o example_dataset.yon -c 2000 -``` - -Tachyon can protect your sensitive identifying information with high-grade encryption. By default, each data field is encrypted separately in each block with different keys using AES-256. 
Simply pass the `-e` flag and the best practices will be used. -```bash -tachyon import -i examples/example_dataset.bcf -o example_dataset.yon -c 2000 -e -``` -This will produce two output files: -* example_dataset.yon -* example_dataset.kyon - -### `view`: Viewing, converting, and slicing `YON` files -Printing a `yon` file as a bit-exact copy of the input `VCF` -```bash -tachyon view -i example_dataset.yon -H -``` -Output -``` -Contig110_arrow 672 . A T 525.07 basic_filtering AC=10;AF=0.217;AN=46;BaseQRankSum=0.967;DP=72;ExcessHet=0.8113;FS=54.73;InbreedingCoeff=-0.0525;MLEAC=11;MLEAF=0.239;MQ=31.05;MQRankSum=1.38;QD=18.11;ReadPosRankSum=-0.431;SOR=5.889 GT:AD:DP:GQ:PL 0/0:1,0:1:3:0,3,38 1/1:0,2:.:6:76,6,0 ./.:0,0:0:.:0,0,0 0/1:2,2:.:43:58,0,43 0/0:1,0:1:3:0,3,24 0/0:4,0:4:12:0,12,141 ./.:0,0:0:.:0,0,0 0/0:1,0:1:3:0,3,29 0/1:1,4:.:19:147,0,19 0/1:3,2:.:49:71,0,49 0/0:5,0:5:0:0,0,81 ./.:1,0:1:.:0,0,0 ./.:0,0:0:.:0,0,0 0/0:6,0:6:18:0,18,192 1/1:0,2:.:6:58,6,0 0/0:1,0:1:3:0,3,11 ./.:1,0:1:.:0,0,0 ./.:0,0:0:.:0,0,0 0/1:3,2:.:58:58,0,63 ./.:0,0:0:.:0,0,0 0/0:4,0:4:12:0,12,134 0/0:3,0:3:0:0,0,44 0/0:3,0:3:9:0,9,90 ./.:0,0:0:.:0,0,0 ./.:0,0:0:.:0,0,0 ./.:0,0:0:.:0,0,0 0/0:2,0:2:6:0,6,53 0/1:1,2:.:19:62,0,19 0/0:3,0:3:9:0,9,84 0/0:2,0:2:6:0,6,49 0/1:1,2:.:19:74,0,19 0/0:1,0:1:3:0,3,38 ./.:2,0:2:.:0,0,0 ./.:0,0:0:.:0,0,0 0/0:2,0:2:6:0,6,65 ./.:0,0:0:.:0,0,0 -``` -We can check for bit-exact output from `tachyon` by comparing the output of the cryptographic hash function `SHA512` for `bcftools`. We drop the header `-H` as these two are different: both tools inject a timestamp and library versions each time a command is executed. 
-```bash -tachyon view -i example_dataset.yon -H | openssl dgst -sha512 -``` -``` -4c94ee35fa3509935e5ea63f6da9b39dc94b1073b551c7d4d56bca7666a6872ad629b6f91f43a8dc45b306c0b0bbb2f414fb811ed45c7e6434c3570b2e448c68 -``` -```bash -bcftools view example_dataset.bcf -H | openssl dgst -sha512 -``` -``` -4c94ee35fa3509935e5ea63f6da9b39dc94b1073b551c7d4d56bca7666a6872ad629b6f91f43a8dc45b306c0b0bbb2f414fb811ed45c7e6434c3570b2e448c68 -``` - -Listing only site-specific information and `INFO` fields: -```bash -tachyon view -i example_dataset.yon -GH -``` -Output -``` -Contig110_arrow 672 . A T 525.07 basic_filtering AC=10;AF=0.217;AN=46;BaseQRankSum=0.967;DP=72;ExcessHet=0.8113;FS=54.73;InbreedingCoeff=-0.0525;MLEAC=11;MLEAF=0.239;MQ=31.05;MQRankSum=1.38;QD=18.11;ReadPosRankSum=-0.431;SOR=5.889 -``` - -### Field-slicing -Listing a specific `INFO` field with output data still adhering to the `VCF` specification: -```bash -tachyon view -i example_dataset.yon -GH -f "INFO=AC" -``` -Output -``` -Contig110_arrow 672 . . . . basic_filtering AC=10 -``` - -Add `REF` and `ALT` to the output -```bash -tachyon view -i example_dataset.yon -GH -f "INFO=AC;REF;ALT" -``` -Output +sudo ./install.sh ``` -Contig110_arrow 19575 . A C . basic_filtering AC=2 -``` - -Listing this output in a custom tab-delimited format +Note the added `--recursive` flag to the clone request. This flag is required to additionally pull down the latest third-party dependencies. The install.sh file depends extensively on apt-get, so it is unlikely to run without extensive modifications on non-Debian-based systems. +If you do not have super-user privileges required to install new packages on your system then run ```bash -tachyon view -i example_dataset.yon -GH -f "INFO=AC;REF;ALT" -c -d'\t' -``` -Output (first five lines) -``` -A T 10 -A T 36 -TG T 34 -G A 2 -G C 2 +./install.sh local ``` +In this situation, all required dependencies are downloaded and built in the current directory. 
This approach will require additional effort if you intend to move the compiled libraries to a new directory. -Listing `CHROM`, `POS`, and all `INFO` fields -```bash -tachyon view -i example_dataset.yon -GH -f "CHROM;POS;INFO" -c -d'|' -``` -Output -``` -Contig110_arrow|672|10|0.217|46|0.967|72|0.8113|54.73|-0.0525|11|0.239|31.05|1.38|18.11|-0.431|5.889 -``` +## Documentation -Listing all available `INFO` fields and the `FORMAT` fields `DP` and `PL` in VCF -```bash -tachyon view -i example_dataset.yon -f "chrom;pos;ref;alt;info;format=dp,pl" -F vcf -H -``` -Output -``` -Contig110_arrow 672 . A T . basic_filtering AC=10;AF=0.217;AN=46;BaseQRankSum=0.967;DP=72;ExcessHet=0.8113;FS=54.73;InbreedingCoeff=-0.0525;MLEAC=11;MLEAF=0.239;MQ=31.05;MQRankSum=1.38;QD=18.11;ReadPosRankSum=-0.431;SOR=5.889 DP:PL 1:0,3,38 .:76,6,0 0:0,0,0 .:58,0,43 1:0,3,24 4:0,12,141 0:0,0,0 1:0,3,29 .:147,0,19 .:71,0,49 5:0,0,81 1:0,0,0 0:0,0,0 6:0,18,192 .:58,6,0 1:0,3,11 1:0,0,0 0:0,0,0 .:58,0,63 0:0,0,0 4:0,12,134 3:0,0,44 3:0,9,90 0:0,0,0 0:0,0,0 0:0,0,0 2:0,6,53 .:62,0,19 3:0,9,84 2:0,6,49 .:74,0,19 1:0,3,38 2:0,0,0 0:0,0,0 2:0,6,65 0:0,0,0 -``` +* Overview. +* [Building and installing](docs/building.md) +* [Getting started](docs/getting_started.md) +* [Summary of example programs](docs/example_programs.md). 
+* [Performance benchmarks](docs/benchmarks.md) -Listing all available `INFO` fields and the `FORMAT` fields `DP` and `PL` in JSON -```bash -tachyon view -i example_dataset.yon -f "chrom;pos;ref;alt;info;format=dp,pl" -F JSON -``` -Output -```json -"block": { - "obj-0": { - "contig": "Contig110_arrow", - "position": 672, - "ref": "A", - "alt": "T", - "INFO-AC": 10, - "INFO-AF": 0.217, - "INFO-AN": 46, - "INFO-BaseQRankSum": 0.967, - "INFO-DP": 72, - "INFO-ExcessHet": 0.8113, - "INFO-FS": 54.73, - "INFO-InbreedingCoeff": -0.0525, - "INFO-MLEAC": 11, - "INFO-MLEAF": 0.239, - "INFO-MQ": 31.05, - "INFO-MQRankSum": 1.38, - "INFO-QD": 18.11, - "INFO-ReadPosRankSum": -0.431, - "INFO-SOR": 5.889, - "FORMAT-DP": [1, null, 0, null, 1, 4, 0, 1, null, null, 5, 1, 0, 6, null, 1, 1, 0, null, 0, 4, 3, 3, 0, 0, 0, 2, null, 3, 2, null, 1, 2, 0, 2, 0], - "FORMAT-PL": [ - [0, 3, 38], - [76, 6, 0], - [0, 0, 0], - [58, 0, 43], - [0, 3, 24], - [0, 12, 141], - [0, 0, 0], - [0, 3, 29], - [147, 0, 19], - [71, 0, 49], - [0, 0, 81], - [0, 0, 0], - [0, 0, 0], - [0, 18, 192], - [58, 6, 0], - [0, 3, 11], - [0, 0, 0], - [0, 0, 0], - [58, 0, 63], - [0, 0, 0], - [0, 12, 134], - [0, 0, 44], - [0, 9, 90], - [0, 0, 0], - [0, 0, 0], - [0, 0, 0], - [0, 6, 53], - [62, 0, 19], - [0, 9, 84], - [0, 6, 49], - [74, 0, 19], - [0, 3, 38], - [0, 0, 0], - [0, 0, 0], - [0, 6, 65], - [0, 0, 0] - ] - }, -``` +### Contributing -Listing all available `INFO` fields and the `FORMAT` fields `DP` and `PL` in a custom output format with a tab-delimiter. 
No restrictions are placed on the output format -```bash -tachyon view -i example_dataset.yon -f "pos;info;format=dp,pl" -F CUSTOM -cd'\t' -``` -Output -``` -672 10 0.217 46 0.967 72 0.8113 54.73 -0.0525 11 0.239 31.05 1.38 18.11 -0.431 5.889 DP:PL 1:0,3,38 .:76,6,0 0:0,0,0 .:58,0,43 1:0,3,24 4:0,12,141 0:0,0,01:0,3,29 .:147,0,19 .:71,0,49 5:0,0,81 1:0,0,0 0:0,0,0 6:0,18,192 .:58,6,0 1:0,3,11 1:0,0,0 0:0,0,0 .:58,0,63 0:0,0,0 4:0,12,134 3:0,0,44 3:0,9,90 0:0,0,0 0:0,0,00:0,0,0 2:0,6,53 .:62,0,19 3:0,9,84 2:0,6,49 .:74,0,19 1:0,3,38 2:0,0,0 0:0,0,0 2:0,6,65 0:0,0,0 -``` +Interested in contributing? Fork and submit a pull request and it will be reviewed. -Listing all available `INFO` fields and the `FORMAT` fields `DP` and `PL` in a custom output format with a tab-delimiter and -all `FORMAT` data are printed as vectors of samples instead of per-sample -```bash -tachyon view -i example_dataset.yon -f "chrom;pos;ref;alt;info;format=dp,pl" -F CUSTOM -cd'\t' -V -``` -Output -``` -Contig110_arrow 672 A T 10 0.217 46 0.967 72 0.8113 54.73 -0.0525 11 0.239 31.05 1.38 18.11 -0.431 5.889 DP:PL 1,.,0,.,1,4,0,1,.,.,5,1,0,6,.,1,1,0,.,0,4,3,3,0,0,0,2,.,3,2,.,1,2,0,2,0 0,3,38,76,6,0,0,0,0,58,0,43,0,3,24,0,12,141,0,0,0,0,3,29,147,0,19,71,0,49,0,0,81,0,0,0,0,0,0,0,18,192,58,6,0,0,3,11,0,0,0,0,0,0,58,0,63,0,0,0,0,12,134,0,0,44,0,9,90,0,0,0,0,0,0,0,0,0,0,6,53,62,0,19,0,9,84,0,6,49,74,0,19,0,3,38,0,0,0,0,0,0,0,6,65,0,0,0 -``` - -Output all available `INFO` fields and the `FORMAT` field `DP` and `FILTERS` -```bash -tachyon view -i example_dataset.yon -f "chrom;pos;ref;alt;info;format=dp;filter" -F CUSTOM -cd';' -V -``` - -### Searching for genomic regions -Slicing intervals either as a contig, contig with a single position, or interval with a contig: -```bash -tachyon view -i example_dataset.yon -r "Contig110_arrow" -``` -```bash -tachyon view -i example_dataset.yon -r "Contig110_arrow:672" -``` -```bash -tachyon view -i example_dataset.yon -r "Contig110_arrow:672-1500" -``` +### 
Support +We are actively developing Tachyon and are always interested in improving its quality. If you run into an issue, please report the problem on our Issue tracker. Be sure to add enough detail to your report that we can reproduce the problem and address it. We have not reached version 1.0 and as such the specification and/or the API interfaces may change. -### Annotating meta-data -It is possible to annotate data with a series of `INFO` fields computed directly from the genotypic vectors or from the reference/alternative allele data: +### Version +This is Tachyon 0.3.0. Tachyon follows [semantic versioning](https://semver.org/). -| Field | Length | Type | Description | -|-----------------|--------|---------|---------------------------------------------| -| `FS_A` | `A` | `Float` | PHRED-scaled Fisher's exact test P-value for allelic strand bias | -| `AN` | `A` | `Integer` | Total number of alleles in called genotypes | -| `NM` | `A` | `Integer` | Total number of missing alleles in called genotypes | -| `NPM` | `A` | `Integer` | Total number of samples with non-reference (compared to largest) ploidy | -| `AC` | `A` | `Integer` | Total number of alleles | -| `AC_FWD` | `A` | `Integer` | Total number of alleles on the *forward* strand | -| `AC_REV` | `A` | `Integer` | Total number of alleles on the *reverse* strand | -| `AF` | `A` | `Float` | Allele frequency of allele | -| `HWE_P` | `A` | `Float` | Hardy-Weinberg equilibrium P-value | -| `VT` | `A` | `String` | Variant classification (SNP, MNP, INDEL, CLUMPED, SV, UNKNOWN) | -| `MULTI_ALLELIC` | 0 | `Flag` | Indicates if a site is multi-allelic (number of alternative alleles > 1) | -| `F_PIC` | `A` | `Float` | Population inbreeding coefficient (F-statistic) | - -The contingency table, or matrix, for the Fisher's exact test (`FS_A` ) for strand bias looks like this: - -| | Target allele | *Not* target allele | -|----------------|---------------|-------------------| -| Forward strand | A | B | -| Reverse strand | 
C | D | - -In the biallelic case, only one P-value is reported becuase of symmetry. If the site is not biallelic then each individual allele is computed separately. - -Using the example dataset, we can compute those fields that are not already available by passing the flag `-X` - -```bash -tachyon view -i example_dataset.yon -GHX -``` -Output -``` -Contig110_arrow 672 . A T 525.07 basic_filtering AC=10;AF=0.217;AN=46;BaseQRankSum=0.967;DP=72;ExcessHet=0.8113;FS=54.73;InbreedingCoeff=-0.0525;MLEAC=11;MLEAF=0.239;MQ=31.05;MQRankSum=1.38;QD=18.11;ReadPosRankSum=-0.431;SOR=5.889;FS_A=11.5091,0;NM=26;AC_FWD=21,2;AC_REV=15,8;HWE_P=0.25072;VT=SNP -``` - -## C++ API Examples -We provide several API examples in the `lib_example` directory. Get started by `make examples`: this requires you to have compiled the shared library first. +### History +Tachyon grew out of the [Tomahawk][tomahawk] project for calculating genome-wide linkage-disequilibrium. ### Author Marcus D. R. Klarqvist () @@ -357,7 +68,6 @@ Wellcome Trust Sanger Institute ### License Tachyon is licensed under [MIT](LICENSE) - [openssl]: https://www.openssl.org/ [zstd]: https://github.com/facebook/zstd [tomahawk]: https://github.com/mklarqvist/tomahawk diff --git a/examples/1kgp3_yon_bcf.jpeg b/docs/1kgp3_yon_bcf.jpeg similarity index 100% rename from examples/1kgp3_yon_bcf.jpeg rename to docs/1kgp3_yon_bcf.jpeg diff --git a/docs/benchmarks.md b/docs/benchmarks.md index bfd090b..0409408 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -10,7 +10,7 @@ We simulated haplotypes using [msprime][msprime] for a 50 megabase region for va | 685,363 | 250,000 | 12.64719 | 1.90681 | 1927.011 | 24.671 | 1028.046 | 4.745 | | 719,754 | 500,000 | 25.08209 | 3.04404 | 4139.424 | 45.247 | 2241.756 | 11.706 | -![screenshot](examples/sim_50gbp.jpeg) +![screenshot](sim_50gbp.jpeg) ### Real datasets @@ -41,7 +41,7 @@ The following table shows data for the 1000 Genomes Project Phase 3 release (2,5 | 21 | 180.55 | 5664.7 | 
50.263 | 236.30 | 112.70 | 31.374 | 23.972 | | 22 | 177.18 | 5654.2 | 51.468 | 229.95 | 109.86 | 31.912 | 24.589 | -![screenshot](examples/1kgp3_yon_bcf.jpeg) +![screenshot](1kgp3_yon_bcf.jpeg) The following table shows data for the Haplotype Reference Consortium (32,488 whole-genome sequenced samples) | Contig | BCF-compressed | BCF-uncompressed | YON-compressed | YON-uncompressed | YON_fold | BCF_fold | Uncompressed_fold | diff --git a/docs/building.md b/docs/building.md new file mode 100644 index 0000000..9d389c2 --- /dev/null +++ b/docs/building.md @@ -0,0 +1,37 @@ +# Installing Tachyon +## Dependencies +You will need to have installed the following dependencies: +* [zstd][zstd]: A compression library developed at Facebook +* [openssl][openssl]: An open-source library for encryption/decryption +* [htslib][htslib]: C library for high-throughput sequencing data formats + +## Building from source +If the required external dependencies listed above are installed then building is trivial. Note the added `--recursive` flag to the clone request. This flag is required to additionally pull down the latest third-party dependencies. +```bash +git clone --recursive https://github.com/mklarqvist/tachyon +cd tachyon +make +``` +Tachyon comes bundled with several API-examples in the `lib_example` directory. They are built by default but should you want to rebuild them execute the command: +```bash +make examples +``` + +## Installation +For Ubuntu, Debian, and Mac systems, installation is easy: just run +```bash +git clone --recursive https://github.com/mklarqvist/tachyon +cd tachyon +sudo ./install.sh +``` +Note the added `--recursive` flag to the clone request. This flag is required to additionally pull down the latest third-party dependencies. The install.sh file depends extensively on apt-get, so it is unlikely to run without extensive modifications on non-Debian-based systems. 
+If you do not have super-user privileges required to install new packages on your system then run +```bash +./install.sh local +``` +In this situation, all required dependencies are downloaded and built in the current directory. This approach will require additional effort if you intend to move the compiled libraries to a new directory. + +[openssl]: https://www.openssl.org/ +[zstd]: https://github.com/facebook/zstd +[tomahawk]: https://github.com/mklarqvist/tomahawk +[htslib]: https://github.com/samtools/htslib \ No newline at end of file diff --git a/docs/example_programs.md b/docs/example_programs.md new file mode 100644 index 0000000..4c8e7dd --- /dev/null +++ b/docs/example_programs.md @@ -0,0 +1,26 @@ +# Example programs +After running `make examples`, the executable versions of the example programs are located in `lib_example`. You can +run these examples using the bundled example dataset located at `examples/example_dataset.yon`. For example, to run `meta_container` +you would have to run a command like +```bash +lib_example/meta_container examples/example_dataset.yon +``` + +Here is a summary of example programs included with Tachyon: +* `calculate_depth_profile ` + If the input tachyon file has the FORMAT field DP set, then the output will be a matrix of average, standard deviation, minimum and maximum, and total number of non-zero depth for each individual. This example program demonstrates the power of the `SummaryStatistics` objects. +* `format_container_balanced ` + If the input tachyon file has the FORMAT field GQ set, then the output will print a VCF-string for each variant site. This example uses the balanced FORMAT container that contains + empty entries to match the number of variants in a block. +* `format_container_raw ` + If the input tachyon file has the FORMAT field PGT set, then the output will print a VCF-string for each site that has data. 
This example uses the raw FORMAT container that, unlike its balanced counterpart, does not store empty records. This means that the container has no knowledge of what records go with what sites. This is generally useful when site-information is not directly required. This example also demonstrates how to use the `FormatContainer` with strings. +* `genotype_container ` + If the input tachyon file has genotypes available, then this example will demonstrate the various internal representations of genotype containers. 1) Return the literal encoded objects: this is useful for most low-level operations but requires considerable technical insight; 2) Vector of genotypes for a site but in, potentially, permuted order: this is useful when ordering is not important. Retrieving permuted genotypic vectors is much more efficient than retrieving unpermuted ones; 3) Vector of genotypes for a site in original (unpermuted) order. This example will first print out the number of elements in each of these three containers and then print the content of each. +* `genotype_likelihoods ` + If the input tachyon file has the FORMAT field PL set, this example will print out the genotype likelihoods for each genotype. This example demonstrates the use of a floating value `FormatContainer`. +* `info_container_balance_comparison ` + This example program will print out the difference between two `InfoContainer` objects storing the same data where one is balanced and one is not. The input file has to have the INFO field InbreedingCoeff set. +* `info_container_balanced ` + If the input tachyon file has the INFO field DP set, then this program will print out the sum depth at each site. Because this is a balanced container the output may contain empty values. +* `meta_container ` + This example demonstrates the use of the `MetaContainer` that stores site-specific and required internal data. This example will print out VCF-strings for the site-specific information for each site. 
\ No newline at end of file diff --git a/docs/getting_started.md b/docs/getting_started.md new file mode 100644 index 0000000..f5920a7 --- /dev/null +++ b/docs/getting_started.md @@ -0,0 +1,138 @@ +# Getting started with Tachyon + +## Table of contents +- [Workflow example: using the CLI](#workflow-example-using-the-cli) + - [`import`: Importing `VCF`/`BCF`](#import-importing-vcfbcf) + - [`view`: Viewing, converting, and slicing `YON` files](#view-viewing-converting-and-slicing-yon-files) + - [Field-slicing](#field-slicing) + - [Searching for genomic regions](#searching-for-genomic-regions) + - [Annotating meta-data](#annotating-meta-data) +- [C++ API Examples](#c-api-examples) + +--- + +## Workflow example: using the CLI +### `import`: Importing `VCF`/`BCF` +Import a `bcf` file to `yon` with a block-size of `-c` number of variants and/or `-C` number of base-pairs. If both `-c` and `-C` are set then the block breaks whenever either condition is satisfied. Compression levels can be adjusted (`-L`) in the range 0 to 20, corresponding to worse to better compression, at a trade-off between compression time and file size. The decompression times are virtually unaffected by the compression level chosen. +```bash +tachyon import -i examples/example_dataset.bcf -o example_dataset.yon -c 2000 +``` + +Tachyon can protect your sensitive identifying information with high-grade encryption. By default, each data field is encrypted with a unique key in each block using [AES-256](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard). Simply pass the `-e` flag and the best practices will be used. 
+```bash +tachyon import -i examples/example_dataset.bcf -o example_dataset.yon -c 2000 -e +``` +This will produce two output files: +* example_dataset.yon +* example_dataset.kyon + +### `view`: Viewing, converting, and slicing `YON` files +Printing a `yon` file as a bit-exact copy of the input `VCF` +```bash +tachyon view -i example_dataset.yon -H +``` +Output +``` +Contig110_arrow 672 . A T 525.07 basic_filtering AC=10;AF=0.217;AN=46;BaseQRankSum=0.967;DP=72;ExcessHet=0.8113;FS=54.73;InbreedingCoeff=-0.0525;MLEAC=11;MLEAF=0.239;MQ=31.05;MQRankSum=1.38;QD=18.11;ReadPosRankSum=-0.431;SOR=5.889 GT:AD:DP:GQ:PL 0/0:1,0:1:3:0,3,38 1/1:0,2:.:6:76,6,0 ./.:0,0:0:.:0,0,0 0/1:2,2:.:43:58,0,43 0/0:1,0:1:3:0,3,24 0/0:4,0:4:12:0,12,141 ./.:0,0:0:.:0,0,0 0/0:1,0:1:3:0,3,29 0/1:1,4:.:19:147,0,19 0/1:3,2:.:49:71,0,49 0/0:5,0:5:0:0,0,81 ./.:1,0:1:.:0,0,0 ./.:0,0:0:.:0,0,0 0/0:6,0:6:18:0,18,192 1/1:0,2:.:6:58,6,0 0/0:1,0:1:3:0,3,11 ./.:1,0:1:.:0,0,0 ./.:0,0:0:.:0,0,0 0/1:3,2:.:58:58,0,63 ./.:0,0:0:.:0,0,0 0/0:4,0:4:12:0,12,134 0/0:3,0:3:0:0,0,44 0/0:3,0:3:9:0,9,90 ./.:0,0:0:.:0,0,0 ./.:0,0:0:.:0,0,0 ./.:0,0:0:.:0,0,0 0/0:2,0:2:6:0,6,53 0/1:1,2:.:19:62,0,19 0/0:3,0:3:9:0,9,84 0/0:2,0:2:6:0,6,49 0/1:1,2:.:19:74,0,19 0/0:1,0:1:3:0,3,38 ./.:2,0:2:.:0,0,0 ./.:0,0:0:.:0,0,0 0/0:2,0:2:6:0,6,65 ./.:0,0:0:.:0,0,0 +``` +We can check for bit-exact output from `tachyon` by comparing the output of the cryptographic hash function `SHA512` for `bcftools`. We drop the header `-H` as these two are different: both tools inject a timestamp and library versions each time a command is executed among other things. 
+```bash +tachyon view -i example_dataset.yon -H | openssl dgst -sha512 +``` +``` +4c94ee35fa3509935e5ea63f6da9b39dc94b1073b551c7d4d56bca7666a6872ad629b6f91f43a8dc45b306c0b0bbb2f414fb811ed45c7e6434c3570b2e448c68 +``` +```bash +bcftools view example_dataset.bcf -H | openssl dgst -sha512 +``` +``` +4c94ee35fa3509935e5ea63f6da9b39dc94b1073b551c7d4d56bca7666a6872ad629b6f91f43a8dc45b306c0b0bbb2f414fb811ed45c7e6434c3570b2e448c68 +``` + +Listing only site-specific information and `INFO` fields: +```bash +tachyon view -i example_dataset.yon -GH +``` +Output +``` +Contig110_arrow 672 . A T 525.07 basic_filtering AC=10;AF=0.217;AN=46;BaseQRankSum=0.967;DP=72;ExcessHet=0.8113;FS=54.73;InbreedingCoeff=-0.0525;MLEAC=11;MLEAF=0.239;MQ=31.05;MQRankSum=1.38;QD=18.11;ReadPosRankSum=-0.431;SOR=5.889 +``` + +### Field-slicing +Listing a specific `INFO` field with output data still adhering to the `VCF` specification: +```bash +tachyon view -i example_dataset.yon -GH -f "INFO=AC" +``` +Output +``` +Contig110_arrow 672 . . . . basic_filtering AC=10 +``` + +Add `REF` and `ALT` to the output +```bash +tachyon view -i example_dataset.yon -GH -f "INFO=AC;REF;ALT" +``` +Output +``` +Contig110_arrow 19575 . A C . basic_filtering AC=2 +``` + +Listing all available `INFO` fields and the `FORMAT` fields `DP` and `PL` in VCF +```bash +tachyon view -i example_dataset.yon -f "chrom;pos;ref;alt;info;format=dp,pl" -H -O vcf +``` +Output +``` +Contig110_arrow 672 . A T . 
basic_filtering AC=10;AF=0.217;AN=46;BaseQRankSum=0.967;DP=72;ExcessHet=0.8113;FS=54.73;InbreedingCoeff=-0.0525;MLEAC=11;MLEAF=0.239;MQ=31.05;MQRankSum=1.38;QD=18.11;ReadPosRankSum=-0.431;SOR=5.889 DP:PL 1:0,3,38 .:76,6,0 0:0,0,0 .:58,0,43 1:0,3,24 4:0,12,141 0:0,0,0 1:0,3,29 .:147,0,19 .:71,0,49 5:0,0,81 1:0,0,0 0:0,0,0 6:0,18,192 .:58,6,0 1:0,3,11 1:0,0,0 0:0,0,0 .:58,0,63 0:0,0,0 4:0,12,134 3:0,0,44 3:0,9,90 0:0,0,0 0:0,0,0 0:0,0,0 2:0,6,53 .:62,0,19 3:0,9,84 2:0,6,49 .:74,0,19 1:0,3,38 2:0,0,0 0:0,0,0 2:0,6,65 0:0,0,0 +``` + +### Searching for genomic regions +Slicing intervals either as a contig, contig with a single position, or interval with a contig: +```bash +tachyon view -i example_dataset.yon -r "Contig110_arrow" +``` +```bash +tachyon view -i example_dataset.yon -r "Contig110_arrow:672" +``` +```bash +tachyon view -i example_dataset.yon -r "Contig110_arrow:672-1500" +``` + +### Annotating meta-data +It is possible to annotate data with a series of `INFO` fields computed directly from the genotypic vectors or from the reference/alternative allele data: + +| Field | Length | Type | Description | +|-----------------|--------|---------|---------------------------------------------| +| `FS_A` | `A` | `Float` | PHRED-scaled Fisher's exact test P-value for allelic strand bias | +| `AN` | `1` | `Integer` | Total number of alleles in called genotypes | +| `NM` | `1` | `Integer` | Total number of missing alleles in called genotypes | +| `NPM` | `1` | `Integer` | Total number of samples with non-reference (compared to largest) ploidy | +| `AC` | `A` | `Integer` | Total number of alleles | +| `AC_P` | `A` | `Integer` | Total number of alleles each strand | +| `AF` | `A` | `Float` | Allele frequency of allele | +| `HWE_P` | `1` | `Float` | Hardy-Weinberg equilibrium P-value | +| `VT` | `A` | `String` | Variant classification (SNP, MNP, INDEL, CLUMPED, SV, UNKNOWN) | +| `MULTI_ALLELIC` | 0 | `Flag` | Indicates if a site is multi-allelic (number of alternative alleles 
> 1) | +| `F_PIC` | `1` | `Float` | Population inbreeding coefficient (F-statistic) | + +The contingency table, or matrix, for the Fisher's exact test (`FS_A`) for strand bias looks like this: + +| | Target allele | *Not* target allele | +|----------------|---------------|-------------------| +| Forward strand | A | B | +| Reverse strand | C | D | + +In the biallelic case, only one P-value is reported because of symmetry. If the site is not biallelic then each individual allele is computed separately. + +Using the example dataset, we can compute those fields that are not already available by passing the flag `-X` + +```bash +tachyon view -i example_dataset.yon -GHX +``` +Output +``` +Contig110_arrow 672 . A T 525.07 basic_filtering AC=10;AF=0.217;AN=46;BaseQRankSum=0.967;DP=72;ExcessHet=0.8113;FS=54.73;InbreedingCoeff=-0.0525;MLEAC=11;MLEAF=0.239;MQ=31.05;MQRankSum=1.38;QD=18.11;ReadPosRankSum=-0.431;SOR=5.889;FS_A=11.5091,0;NM=26;AC_FWD=21,2;AC_REV=15,8;HWE_P=0.25072;VT=SNP +``` + +## C++ API Examples +We provide several API examples in the `lib_example` directory. Get started by `make examples`: this requires you to have compiled the shared library first. 
diff --git a/examples/hrc_bcf_yon.jpeg b/docs/hrc_bcf_yon.jpeg similarity index 100% rename from examples/hrc_bcf_yon.jpeg rename to docs/hrc_bcf_yon.jpeg diff --git a/examples/sim_50gbp.jpeg b/docs/sim_50gbp.jpeg similarity index 100% rename from examples/sim_50gbp.jpeg rename to docs/sim_50gbp.jpeg diff --git a/examples/1kgp3_chr20_ibs.png b/examples/1kgp3_chr20_ibs.png deleted file mode 100644 index 9c16e6d..0000000 Binary files a/examples/1kgp3_chr20_ibs.png and /dev/null differ diff --git a/examples/import/msprime_sim_50gb_1k_p1.bcf b/examples/import/msprime_sim_50gb_1k_p1.bcf new file mode 100644 index 0000000..6ef2630 Binary files /dev/null and b/examples/import/msprime_sim_50gb_1k_p1.bcf differ diff --git a/examples/import/msprime_sim_50gb_1k_p2.bcf b/examples/import/msprime_sim_50gb_1k_p2.bcf new file mode 100644 index 0000000..dcc6bd1 Binary files /dev/null and b/examples/import/msprime_sim_50gb_1k_p2.bcf differ diff --git a/examples/import/msprime_sim_50gb_1k_p3.bcf b/examples/import/msprime_sim_50gb_1k_p3.bcf new file mode 100644 index 0000000..51ca0d7 Binary files /dev/null and b/examples/import/msprime_sim_50gb_1k_p3.bcf differ diff --git a/examples/import/msprime_sim_50gb_1k_p4.bcf b/examples/import/msprime_sim_50gb_1k_p4.bcf new file mode 100644 index 0000000..3651c10 Binary files /dev/null and b/examples/import/msprime_sim_50gb_1k_p4.bcf differ diff --git a/examples/import/msprime_sim_50gb_1k_p5.bcf b/examples/import/msprime_sim_50gb_1k_p5.bcf new file mode 100644 index 0000000..9dc41e4 Binary files /dev/null and b/examples/import/msprime_sim_50gb_1k_p5.bcf differ diff --git a/examples/import/msprime_sim_50gb_1k_p6.bcf b/examples/import/msprime_sim_50gb_1k_p6.bcf new file mode 100644 index 0000000..bf193a1 Binary files /dev/null and b/examples/import/msprime_sim_50gb_1k_p6.bcf differ diff --git a/examples/yon_tstv_1kgp3.jpeg b/examples/yon_tstv_1kgp3.jpeg deleted file mode 100644 index 697d4f7..0000000 Binary files a/examples/yon_tstv_1kgp3.jpeg 
and /dev/null differ diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..ecbe86c --- /dev/null +++ b/install.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# ################################################################ +# Copyright (C) 2017-present Genome Research Ltd. +# Author: Marcus D. R. Klarqvist +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# ################################################################ + +function note_build_stage { + echo "========== [$(date)] Stage '${1}' starting" +} + +if [[ "$1" == "local" ]]; then +note_build_stage "Building zstd..." +if [ ! -d zstd ]; then +git clone https://github.com/facebook/zstd +fi +cd zstd +if [ ! -f lib/libzstd.so ]; then + make -j$(nproc) +else + echo "ZSTD already built! Skipping..." +fi +cd .. + +note_build_stage "Building OpenSSL..." +if [ ! -d openssl ]; then +git clone https://github.com/openssl/openssl.git +fi +cd openssl +if [ ! 
-f libssl.so ]; then + ./config + make -j$(nproc) +else + echo "OpenSSL already built! Skipping..." +fi +cd .. + +note_build_stage "Building curl..." +if [ ! -d curl-7.61.0 ]; then +wget https://dl.uxnr.de/mirror/curl/curl-7.61.0.tar.gz +tar -xzvf curl-7.61.0.tar.gz +fi +cd curl-7.61.0 +if [ ! -f lib/.libs/libcurl.so ]; then + CPPFLAGS="-I${PWD}/../openssl/include" LDFLAGS="-L${PWD}/../openssl" ./configure + make -j$(nproc) +else + echo "curl already built! Skipping..." +fi +cd .. + +note_build_stage "Building htslib" +if [ ! -d htslib ]; then +git clone https://github.com/samtools/htslib.git +fi +cd htslib +if [ ! -f htslib.so ]; then + autoheader && autoconf && ./configure CPPFLAGS="-I/usr/local/include/" LDFLAGS="-L/usr/local/lib/" && make -j$(nproc) +else + echo "htslib already built! Skipping..." +fi +cd .. + +else # Install with sudo +if [ "$(uname)" == "Darwin" ]; then + # Update package list + ################################################################################ + brew update + # Install generic dependencies + ################################################################################ + note_build_stage "Install openssl" + brew install openssl + note_build_stage "Install zstd" + brew install zstd + note_build_stage "Install htslib" + brew install htslib +else + # Update package list + ################################################################################ + note_build_stage "Update package list" + sudo -H apt-get -qq -y update + + # Install generic dependencies + ################################################################################ + note_build_stage "Update misc. 
dependencies" + sudo -H apt-get -y install pkg-config zip g++ zlib1g-dev unzip curl git lsb-release liblz4-dev + + # Install htslib dependencies + ################################################################################ + note_build_stage "Install htslib dependencies" + sudo -H apt-get -y install libssl-dev libcurl4-openssl-dev liblz-dev libbz2-dev liblzma-dev + + # Install from Github + ################################################################################ + CURDIR=`echo $PWD` + TMPDIR=`mktemp -d -t` + cd ${TMPDIR} + # Install zstd + ################################################################################ + note_build_stage "Install zstd" + git clone https://github.com/facebook/zstd + cd zstd && make -j$(nproc) && sudo make install + cd .. + # Install htslib + ################################################################################ + note_build_stage "Install htslib" + git clone https://github.com/samtools/htslib.git + cd htslib + autoheader && autoconf && ./configure && make -j$(nproc) && sudo make install + cd ${CURDIR} +fi +fi + +# Install Tachyon +################################################################################ +note_build_stage "Building Tachyon" +make clean; make -j$(nproc) \ No newline at end of file diff --git a/lib/algorithm/OpenHashTable.h b/lib/algorithm/OpenHashTable.h index c6d7261..bcf97de 100644 --- a/lib/algorithm/OpenHashTable.h +++ b/lib/algorithm/OpenHashTable.h @@ -42,7 +42,7 @@ class HashTable{ inline const U32& capacity(void) const{return(this->__size);} inline const U32& size(void) const{return(this->__occupied);} - inline const bool empty(void) const{return(this->__occupied == 0);} + inline bool empty(void) const{return(this->__occupied == 0);} inline value_type& operator[](const U32 position){return(*this->__entries[position]);} inline const value_type& operator[](const U32 position) const{return(*this->__entries[position]);} diff --git a/lib/algorithm/compression/compression_container.h 
b/lib/algorithm/compression/compression_container.h index eccb5ec..a925848 100644 --- a/lib/algorithm/compression/compression_container.h +++ b/lib/algorithm/compression/compression_container.h @@ -1,14 +1,14 @@ #ifndef COMPRESSIONCONTAINER_H_ #define COMPRESSIONCONTAINER_H_ -#include "algorithm/permutation/permutation_manager.h" +#include "core/genotypes.h" #include "containers/data_container.h" namespace tachyon{ namespace algorithm{ /**< Lower bounds threshold in fold-change for compression to be kept */ -#define MIN_COMPRESSION_FOLD 1.1 +#define MIN_COMPRESSION_FOLD 1.05 /** * Permute bits from a byte-stream of U32 into target @@ -20,45 +20,50 @@ namespace algorithm{ * @param destination Destination char* buffer of permuted data * @return TRUE if passing or FALSE otherwise */ -inline const U32 permuteIntBits(const char* const data, - const U32 size, - char* destination) +inline U32 permuteIntBits(const char* const data, + const uint32_t size, + char* destination) { if(size == 0) return 0; - U32 internal_size = size + (32-size%32); // Balance bytes + // Balance the number of bytes in the output + // byte stream to be divisible by 32. Assert + // that this is true or the procedure fails. + const uint32_t internal_size = size + (32-size % 32); // Balance bytes assert(internal_size % 32 == 0); - BYTE* dest = reinterpret_cast(destination); + // Interpret the dst target as unsigned char + // to prevent annoying signedness. 
+ uint8_t* dest = reinterpret_cast(destination); memset(dest, 0, internal_size); // Set all bytes to 0 - const BYTE* const d = reinterpret_cast(data); // Recast as uchar - BYTE* target[32]; // Bucket pointers - const U32 partition_size = internal_size / 32; // Partition size + const uint8_t* const d = reinterpret_cast(data); // Recast as uchar + uint8_t* target[32]; // Bucket pointers + const uint32_t partition_size = internal_size / 32; // Partition size // Assign a pointer to each bucket - for(U32 i = 0; i < 32; ++i) + for(uint32_t i = 0; i < 32; ++i) target[31-i] = &dest[partition_size*i]; - U32 k = 0; U32 p = 0; - // Foreach U32 - // Update position K for each element - // When K reaches position 7 then reset to 0 - for(U32 i = 0; i + 4 < internal_size; i+=4, ++k){ + uint32_t k = 0, p = 0; + // Iterate over the data and update position K for + // each element. When K reaches position 7 then reset + // to 0. + for(uint32_t i = 0; i + 4 < internal_size; i+=4, ++k){ if(k == 8){ k = 0; ++p; } // Foreach bit in U32 // Update target T at byte position P with bit J at position K - for(U32 j = 0; j < 8; ++j) target[j+ 0][p] |= ((d[i] & (1 << j)) >> j) << k; - for(U32 j = 0; j < 8; ++j) target[j+ 8][p] |= ((d[i+1] & (1 << j)) >> j) << k; - for(U32 j = 0; j < 8; ++j) target[j+16][p] |= ((d[i+2] & (1 << j)) >> j) << k; - for(U32 j = 0; j < 8; ++j) target[j+24][p] |= ((d[i+3] & (1 << j)) >> j) << k; + for(uint32_t j = 0; j < 8; ++j) target[j+ 0][p] |= ((d[i] & (1 << j)) >> j) << k; + for(uint32_t j = 0; j < 8; ++j) target[j+ 8][p] |= ((d[i+1] & (1 << j)) >> j) << k; + for(uint32_t j = 0; j < 8; ++j) target[j+16][p] |= ((d[i+2] & (1 << j)) >> j) << k; + for(uint32_t j = 0; j < 8; ++j) target[j+24][p] |= ((d[i+3] & (1 << j)) >> j) << k; } return internal_size; } -inline const U32 unpermuteIntBits(char* data, - const U32 size, - char* destination) +inline U32 unpermuteIntBits(char* data, + const U32 size, + char* destination) { if(size == 0) return 0; //U32 internal_size = 
size + (32-size%32); // Balance bytes @@ -76,13 +81,6 @@ inline const U32 unpermuteIntBits(char* data, for(U32 i = 0; i < 32; ++i) target[31-i] = &temp[partition_size*i]; - /* - for(U32 i = 0; i < size; ++i){ - std::cerr << (int)data[i] << ' '; - } - std::cerr << std::endl; - */ - U32 k = 0; U32 p = 0; // Foreach U32 // Update position K for each element @@ -90,18 +88,17 @@ inline const U32 unpermuteIntBits(char* data, for(U32 i = 0; i < n_entries; ++i, ++k){ if(k == 8){ k = 0; ++p; } - for(U32 j = 0; j < 32; ++j){ + for(U32 j = 0; j < 32; ++j) dest[i] |= ((target[j][p] & (1 << k)) >> k) << j; - } } //std::cerr << "out: " << internal_size << "/" << size/sizeof(U32) << std::endl; return size; } -inline const U32 permuteByteBits(const char* const data, - const U32 size, - char* destination) +inline U32 permuteByteBits(const char* const data, + const U32 size, + char* destination) { if(size == 0) return 0; U32 internal_size = size + (8 - size % 8); // Balance bytes @@ -134,7 +131,7 @@ inline const U32 permuteByteBits(const char* const data, return internal_size; } -inline const U32 unpermuteByteBits(char* data, +inline U32 unpermuteByteBits(char* data, const U32 size, char* destination) { @@ -177,22 +174,20 @@ inline const U32 unpermuteByteBits(char* data, } class CompressionContainer{ -private: +public: typedef CompressionContainer self_type; - -protected: typedef containers::DataContainer container_type; typedef io::BasicBuffer buffer_type; - typedef algorithm::PermutationManager permutation_type; + typedef yon_gt_ppa permutation_type; public: CompressionContainer() = default; virtual ~CompressionContainer() = default; - virtual const bool compress(permutation_type& manager) =0; - virtual const bool compress(container_type& container) =0; - virtual const bool compressStrides(container_type& container) =0; - virtual const bool decompress(container_type& container) =0; - virtual const bool decompressStrides(container_type& container) =0; + virtual bool 
Compress(container_type& container, permutation_type& manager) =0; + virtual bool Compress(container_type& container) =0; + virtual bool CompressStrides(container_type& container) =0; + virtual bool Decompress(container_type& container) =0; + virtual bool DecompressStrides(container_type& container) =0; protected: buffer_type buffer; diff --git a/lib/algorithm/compression/compression_manager.cpp b/lib/algorithm/compression/compression_manager.cpp index a9935e7..d242095 100644 --- a/lib/algorithm/compression/compression_manager.cpp +++ b/lib/algorithm/compression/compression_manager.cpp @@ -1,125 +1,76 @@ +#include "core/genotypes.h" #include "compression_manager.h" namespace tachyon{ namespace algorithm{ -bool CompressionManager::compress(variant_block_type& block, const BYTE general_level, const BYTE float_level){ - zstd_codec.setCompressionLevel(general_level); - zstd_codec.setCompressionLevelData(float_level); - if(block.header.controller.hasGTPermuted) zstd_codec.compress(block.ppa_manager); - zstd_codec.setCompressionLevel(general_level); - if(block.meta_contig_container.header.n_entries) zstd_codec.compress(block.meta_contig_container); - if(block.meta_positions_container.header.n_entries) zstd_codec.compress(block.meta_positions_container); - if(block.meta_refalt_container.header.n_entries) zstd_codec.compress(block.meta_refalt_container); - if(block.meta_controller_container.header.n_entries) zstd_codec.compress(block.meta_controller_container); - if(block.meta_quality_container.header.n_entries) zstd_codec.compress(block.meta_quality_container); - if(block.meta_names_container.header.n_entries){ - //zpaq_codec.compress(block.meta_names_container, false); - zstd_codec.compress(block.meta_names_container); - } +bool CompressionManager::Compress(variant_block_type& block, const BYTE general_level, const BYTE float_level){ + zstd_codec.SetCompressionLevel(general_level); + zstd_codec.SetCompressionLevelData(float_level); - //const std::string zpaq_cmd = "4"; - 
if(block.gt_rle8_container.header.n_entries){ - zstd_codec.compress(block.gt_rle8_container); - //zstd_codec.compressStrides(block.gt_rle8_container); - } - if(block.gt_rle16_container.header.n_entries){ - zstd_codec.compress(block.gt_rle16_container); - //zstd_codec.compressStrides(block.gt_rle16_container); - } - if(block.gt_rle32_container.header.n_entries){ - zstd_codec.compress(block.gt_rle32_container); - //zstd_codec.compressStrides(block.gt_rle32_container); - } - if(block.gt_rle64_container.header.n_entries){ - zstd_codec.compress(block.gt_rle64_container); - //zstd_codec.compressStrides(block.gt_rle64_container); + if(block.header.controller.hasGTPermuted){ + zstd_codec.SetCompressionLevel(22); + zstd_codec.Compress(block.base_containers[YON_BLK_PPA], *block.gt_ppa); } - if(block.meta_alleles_container.header.n_entries){ - zstd_codec.compress(block.meta_alleles_container); - //zstd_codec.compressStrides(block.meta_alleles_container); - } + zstd_codec.SetCompressionLevel(general_level); - if(block.gt_simple8_container.header.n_entries) zstd_codec.compress(block.gt_simple8_container); - if(block.gt_simple16_container.header.n_entries) zstd_codec.compress(block.gt_simple16_container); - if(block.gt_simple32_container.header.n_entries) zstd_codec.compress(block.gt_simple32_container); - if(block.gt_simple64_container.header.n_entries) zstd_codec.compress(block.gt_simple64_container); - if(block.gt_support_data_container.header.n_entries) zstd_codec.compress(block.gt_support_data_container); - if(block.meta_info_map_ids.header.n_entries) zstd_codec.compress(block.meta_info_map_ids); - if(block.meta_filter_map_ids.header.n_entries) zstd_codec.compress(block.meta_filter_map_ids); - if(block.meta_format_map_ids.header.n_entries) zstd_codec.compress(block.meta_format_map_ids); + for(U32 i = 1; i < YON_BLK_N_STATIC; ++i){ + if(block.base_containers[i].header.n_entries){ + zstd_codec.Compress(block.base_containers[i]); + //std::cerr << "Compress: " << i << ": " << 
block.base_containers[i].buffer_data_uncompressed.size() << "->" << block.base_containers[i].buffer_data.size() << std::endl; + } + } for(U32 i = 0; i < block.footer.n_info_streams; ++i){ if(block.info_containers[i].header.data_header.controller.type == YON_TYPE_FLOAT || block.info_containers[i].header.data_header.controller.type == YON_TYPE_DOUBLE){ - zstd_codec.setCompressionLevelData(float_level); - zstd_codec.setCompressionLevelStrides(general_level); - //zpaq_codec.compress(block.info_containers[i]); - //zstd_codec.compress(block.info_containers[i]); - //zstd_codec.compress(block.info_containers[i]); + zstd_codec.SetCompressionLevelData(float_level); + zstd_codec.SetCompressionLevelStrides(general_level); } else { - zstd_codec.setCompressionLevel(general_level); - //zstd_codec.compress(block.info_containers[i]); - //zpaq_codec.compress(block.info_containers[i]); - //zstd_codec.compress(block.info_containers[i]); + zstd_codec.SetCompressionLevel(general_level); } - //zstd_codec.compress(block.info_containers[i]); - //zpaq_codec.compress(block.info_containers[i], "4"); - zstd_codec.compress(block.info_containers[i]); + zstd_codec.Compress(block.info_containers[i]); + //std::cerr << "Compress INFO: " << i << ": " << block.info_containers[i].buffer_data_uncompressed.size() << "->" << block.info_containers[i].buffer_data.size() << std::endl; } for(U32 i = 0; i < block.footer.n_format_streams; ++i){ if(block.format_containers[i].header.data_header.controller.type == YON_TYPE_FLOAT || block.format_containers[i].header.data_header.controller.type == YON_TYPE_DOUBLE){ - zstd_codec.setCompressionLevelData(float_level); - zstd_codec.setCompressionLevelStrides(general_level); - //zpaq_codec.compress(block.format_containers[i]); - //zstd_codec.compress(block.format_containers[i]); + zstd_codec.SetCompressionLevelData(float_level); + zstd_codec.SetCompressionLevelStrides(general_level); } else { - zstd_codec.setCompressionLevel(general_level); - 
//zstd_codec.compress(block.format_containers[i]); - //zpaq_codec.compress(block.format_containers[i]); + zstd_codec.SetCompressionLevel(general_level); } - zstd_codec.compress(block.format_containers[i]); + zstd_codec.Compress(block.format_containers[i]); + //std::cerr << "Compress FORMAT: " << i << ": " << block.format_containers[i].buffer_data_uncompressed.size() << "->" << block.format_containers[i].buffer_data.size() << std::endl; } return true; } -bool CompressionManager::decompress(variant_block_type& block){ - if(block.ppa_manager.PPA.size()){ - if(!this->decompress(block.ppa_manager)){ +bool CompressionManager::Decompress(variant_block_type& block){ + if(block.base_containers[YON_BLK_PPA].GetSizeCompressed()){ + if(!this->Decompress(block.base_containers[YON_BLK_PPA], *block.gt_ppa)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress GT permutation information!" << std::endl; return false; } } - if(block.meta_contig_container.getSizeCompressed()) if(!this->decompress(block.meta_contig_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress meta contig information!" << std::endl; return false; } - if(block.meta_positions_container.getSizeCompressed()) if(!this->decompress(block.meta_positions_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress meta positions information!" << std::endl; return false; } - if(block.meta_refalt_container.getSizeCompressed()) if(!this->decompress(block.meta_refalt_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress meta ref_alt information!" << std::endl; return false; } - if(block.meta_alleles_container.getSizeCompressed()) if(!this->decompress(block.meta_alleles_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress meta alleles information!" 
<< std::endl; return false; } - if(block.meta_controller_container.getSizeCompressed()) if(!this->decompress(block.meta_controller_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress meta controller information!" << std::endl; return false; } - if(block.meta_quality_container.getSizeCompressed()) if(!this->decompress(block.meta_quality_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress meta quality information!" << std::endl; return false; } - if(block.gt_support_data_container.getSizeCompressed()) if(!this->decompress(block.gt_support_data_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress genotype support information!" << std::endl; return false; } - if(block.meta_info_map_ids.getSizeCompressed()) if(!this->decompress(block.meta_info_map_ids)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress meta INFO maps information!" << std::endl; return false; } - if(block.meta_names_container.getSizeCompressed()) if(!this->decompress(block.meta_names_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress meta names information!" << std::endl; return false; } - if(block.meta_filter_map_ids.getSizeCompressed()) if(!this->decompress(block.meta_filter_map_ids)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress meta FILTER maps information!" << std::endl; return false; } - if(block.meta_format_map_ids.getSizeCompressed()) if(!this->decompress(block.meta_format_map_ids)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress meta FORMAT maps information!" << std::endl; return false; } - if(block.gt_rle8_container.getSizeCompressed()) if(!this->decompress(block.gt_rle8_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress genotypes (RLE-8) information!" 
<< std::endl; return false; } - if(block.gt_rle16_container.getSizeCompressed()) if(!this->decompress(block.gt_rle16_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress genotypes (RLE-16) information!" << std::endl; return false; } - if(block.gt_rle32_container.getSizeCompressed()) if(!this->decompress(block.gt_rle32_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress genotypes (RLE-32) information!" << std::endl; return false; } - if(block.gt_rle64_container.getSizeCompressed()) if(!this->decompress(block.gt_rle64_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress genotypes (RLE-64) information!" << std::endl; return false; } - if(block.gt_simple8_container.getSizeCompressed()) if(!this->decompress(block.gt_simple8_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress genotypes (simple-8) information!" << std::endl; return false; } - if(block.gt_simple16_container.getSizeCompressed()) if(!this->decompress(block.gt_simple16_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress genotypes (simple-16) information!" << std::endl; return false; } - if(block.gt_simple32_container.getSizeCompressed()) if(!this->decompress(block.gt_simple32_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress genotypes (simple-32) information!" << std::endl; return false; } - if(block.gt_simple64_container.getSizeCompressed()) if(!this->decompress(block.gt_simple64_container)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress genotypes (simple-64) information!" 
<< std::endl; return false; } + for(U32 i = 1; i < YON_BLK_N_STATIC; ++i){ + if(block.base_containers[i].GetSizeCompressed()){ + if(!this->Decompress(block.base_containers[i])){ + std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress basic container!" << std::endl; + return false; + } + } + } for(U32 i = 0; i < block.footer.n_info_streams; ++i){ - if(block.info_containers[i].getSizeCompressed()){ - if(!this->decompress(block.info_containers[i])){ + if(block.info_containers[i].GetSizeCompressed()){ + if(!this->Decompress(block.info_containers[i])){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress INFO container " << i << "/" << block.footer.n_info_streams << "!" << std::endl; return false; } @@ -127,8 +78,8 @@ bool CompressionManager::decompress(variant_block_type& block){ } for(U32 i = 0; i < block.footer.n_format_streams; ++i){ - if(block.format_containers[i].getSizeCompressed()){ - if(!this->decompress(block.format_containers[i])){ + if(block.format_containers[i].GetSizeCompressed()){ + if(!this->Decompress(block.format_containers[i])){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress FORMAT container " << i << "/" << block.footer.n_format_streams << "!" 
<< std::endl; return false; } @@ -138,11 +89,11 @@ bool CompressionManager::decompress(variant_block_type& block){ return true; } -bool CompressionManager::decompress(algorithm::PermutationManager& permutation_manager){ - return(this->zstd_codec.decompress(permutation_manager)); +bool CompressionManager::Decompress(container_type& container, yon_gt_ppa& gt_ppa){ + return(this->zstd_codec.Decompress(container, gt_ppa)); } -bool CompressionManager::decompress(container_type& container){ +bool CompressionManager::Decompress(container_type& container){ // Ascertain that data is not encrypted if(container.header.data_header.controller.encryption != YON_ENCRYPTION_NONE){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Data is encrypted. Provide a valid keychain and decrypt before proceeding..." << std::endl; @@ -150,20 +101,18 @@ bool CompressionManager::decompress(container_type& container){ } if(container.header.data_header.controller.encoder == YON_ENCODE_ZSTD){ - if(!this->zstd_codec.decompress(container)){ + if(!this->zstd_codec.Decompress(container)){ std::cerr << utility::timestamp("ERROR","CODEC-ZSTD") << "Failed to decompress data!" << std::endl; return false; } } else if(container.header.data_header.controller.encoder == YON_ENCODE_NONE){ - if(!this->no_codec.decompress(container)){ + if(!this->no_codec.Decompress(container)){ std::cerr << utility::timestamp("ERROR","CODEC-NONE") << "Failed to decompress data!" << std::endl; return false; } } else if(container.header.data_header.controller.encoder == YON_ENCODE_ZPAQ){ - if(!this->zpaq_codec.decompress(container)){ - std::cerr << utility::timestamp("ERROR","CODEC-ZPAQ") << "Failed to decompress data!" << std::endl; - return false; - } + std::cerr << utility::timestamp("ERROR","CODEC-ZPAQ") << "ZPAQ is no longer supported!" << std::endl; + return false; } else { std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress! Illegal codec!" 
<< std::endl; return false; @@ -171,11 +120,12 @@ bool CompressionManager::decompress(container_type& container){ if(container.header.data_header.controller.mixedStride){ if(container.header.stride_header.controller.encoder == YON_ENCODE_ZSTD){ - if(!this->zstd_codec.decompressStrides(container)){ std::cerr << utility::timestamp("ERROR","CODEC-ZSTD") << "Failed to decompress strides!" << std::endl; return false; } + if(!this->zstd_codec.DecompressStrides(container)){ std::cerr << utility::timestamp("ERROR","CODEC-ZSTD") << "Failed to decompress strides!" << std::endl; return false; } } else if (container.header.stride_header.controller.encoder == YON_ENCODE_NONE){ - if(!this->no_codec.decompressStrides(container)){ std::cerr << utility::timestamp("ERROR","CODEC-NONE") << "Failed to decompress strides!" << std::endl; return false; } + if(!this->no_codec.DecompressStrides(container)){ std::cerr << utility::timestamp("ERROR","CODEC-NONE") << "Failed to decompress strides!" << std::endl; return false; } } else if (container.header.stride_header.controller.encoder == YON_ENCODE_ZPAQ){ - if(!this->zpaq_codec.decompressStrides(container)){ std::cerr << utility::timestamp("ERROR","CODEC-ZPAQ") << "Failed to decompress strides!" << std::endl; return false; } + std::cerr << utility::timestamp("ERROR","CODEC-ZPAQ") << "ZPAQ is no longer supported!" << std::endl; + return false; } else { std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to decompress! Illegal codec!" 
<< std::endl; return false; diff --git a/lib/algorithm/compression/compression_manager.h b/lib/algorithm/compression/compression_manager.h index 2d68f0f..95405e8 100644 --- a/lib/algorithm/compression/compression_manager.h +++ b/lib/algorithm/compression/compression_manager.h @@ -2,7 +2,6 @@ #define ALGORITHM_COMPRESSION_COMPRESSION_MANAGER_H_ #include "uncompressed_codec.h" -#include "zpaq_codec.h" #include "zstd_codec.h" #include "containers/variant_block.h" @@ -14,7 +13,6 @@ class CompressionManager{ typedef CompressionManager self_type; typedef UncompressedCodec no_codec_type; typedef ZSTDCodec zstd_codec_type; - typedef ZPAQContainer zpaq_codec_type; typedef containers::VariantBlock variant_block_type; typedef containers::DataContainer container_type; @@ -22,21 +20,20 @@ class CompressionManager{ CompressionManager() = default; ~CompressionManager() = default; - bool compress(variant_block_type& block, const BYTE general_level = 6, const BYTE float_level = 3); - bool decompress(variant_block_type& block); - bool decompress(algorithm::PermutationManager& permutation_manager); + bool Compress(variant_block_type& block, const BYTE general_level = 6, const BYTE float_level = 3); + bool Decompress(variant_block_type& block); + bool Decompress(container_type& container, yon_gt_ppa& gt_ppa); /**< * Decompress an abstract data container * @param container Target container * @return Returns TRUE upon success or FALSE otherwise */ - bool decompress(container_type& container); + bool Decompress(container_type& container); public: no_codec_type no_codec; zstd_codec_type zstd_codec; - zpaq_codec_type zpaq_codec; }; } diff --git a/lib/algorithm/compression/genotype_encoder.cpp b/lib/algorithm/compression/genotype_encoder.cpp index 0714d86..9028e85 100644 --- a/lib/algorithm/compression/genotype_encoder.cpp +++ b/lib/algorithm/compression/genotype_encoder.cpp @@ -15,609 +15,443 @@ GenotypeEncoder::GenotypeEncoder(const U64 samples) : GenotypeEncoder::~GenotypeEncoder(){} 
-bool GenotypeEncoder::Encode(const bcf_type& bcf_entry, - meta_type& meta, - block_type& block, - const U32* const ppa) +bool GenotypeEncoder::Encode(const containers::VcfContainer& container, + meta_type* meta_entries, + block_type& block, + const yon_gt_ppa& permutation_array) const { - if(bcf_entry.body->n_allele + 1 >= 32768){ - std::cerr << utility::timestamp("ERROR", "ENCODER") << - "Illegal number of alleles (" << bcf_entry.body->n_allele + 1 << "). " - "Format is limited to 32768..." << std::endl; - return false; - } - - assert(bcf_entry.body != nullptr); - - meta.controller.biallelic = bcf_entry.body->n_allele == 2; - meta.controller.diploid = bcf_entry.gt_support.ploidy == 2; - meta.controller.gt_mixed_phasing = bcf_entry.gt_support.mixedPhasing; - meta.controller.gt_anyMissing = bcf_entry.gt_support.hasMissing; - meta.controller.gt_anyNA = bcf_entry.gt_support.hasMissing; - meta.controller.gt_phase = bcf_entry.gt_support.phase; - meta.controller.mixed_ploidy = bcf_entry.gt_support.hasEOV; - meta.controller.gt_phase = bcf_entry.gt_support.phase; - - if(bcf_entry.hasGenotypes){ - meta.controller.gt_available = true; - } else { - meta.controller.gt_available = false; - return true; - } - - // Assess cost and encode - rle_helper_type cost; - if(meta.controller.biallelic && meta.controller.diploid && meta.controller.mixed_ploidy == false){ // Case diploid and biallelic - cost = this->assessDiploidRLEBiallelic(bcf_entry, ppa); - - meta.controller.gt_compression_type = YON_GT_RLE_DIPLOID_BIALLELIC; - block.gt_support_data_container.Add((U32)cost.n_runs); - ++block.gt_support_data_container; - - switch(cost.word_width){ - case 1: - this->EncodeDiploidRLEBiallelic(bcf_entry, block.gt_rle8_container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_BYTE; - ++block.gt_rle8_container; - ++this->stats_.rle_counts[0]; - break; - case 2: - this->EncodeDiploidRLEBiallelic(bcf_entry, block.gt_rle16_container, ppa, cost); - meta.controller.gt_primtive_type = 
YON_GT_U16; - ++block.gt_rle16_container; - ++this->stats_.rle_counts[1]; - break; - case 4: - this->EncodeDiploidRLEBiallelic(bcf_entry, block.gt_rle32_container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_U32; - ++block.gt_rle32_container; - ++this->stats_.rle_counts[2]; - break; - case 8: - this->EncodeDiploidRLEBiallelic(bcf_entry, block.gt_rle64_container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_U64; - ++block.gt_rle64_container; - ++this->stats_.rle_counts[3]; - break; - default: - std::cerr << utility::timestamp("ERROR","ENCODER") << "Illegal word width (" << (int)cost.word_width << ")... " << std::endl; - return false; - } - - return true; - } - else if(meta.controller.diploid) { // Case diploid n-allelic OR have EOV values - cost = this->assessDiploidRLEnAllelic(bcf_entry, ppa); - - // BCF-style cost - U32 costBCFStyle = this->n_samples; // cost for BCF-style encoding - if(bcf_entry.body->n_allele + 2 < 8) costBCFStyle *= sizeof(SBYTE); - else if(bcf_entry.body->n_allele + 2 < 128) costBCFStyle *= sizeof(S16); - else if(bcf_entry.body->n_allele + 2 < 32768) costBCFStyle *= sizeof(S32); - - // RLE is cheaper - if(cost.word_width * cost.n_runs < costBCFStyle){ - meta.controller.gt_compression_type = YON_GT_RLE_DIPLOID_NALLELIC; - block.gt_support_data_container.Add((U32)cost.n_runs); - ++block.gt_support_data_container; - - switch(cost.word_width){ - case 1: - this->EncodeDiploidRLEnAllelic(bcf_entry, block.gt_simple8_container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_BYTE; - ++block.gt_simple8_container; - ++this->stats_.rle_simple_counts[0]; - break; - case 2: - this->EncodeDiploidRLEnAllelic(bcf_entry, block.gt_simple16_container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_U16; - ++block.gt_simple16_container; - ++this->stats_.rle_simple_counts[1]; - break; - case 4: - this->EncodeDiploidRLEnAllelic(bcf_entry, block.gt_simple32_container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_U32; 
- ++block.gt_simple32_container; - ++this->stats_.rle_simple_counts[2]; - break; - case 8: - this->EncodeDiploidRLEnAllelic(bcf_entry, block.gt_simple64_container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_U64; - ++block.gt_simple64_container; - ++this->stats_.rle_simple_counts[3]; - break; - default: - std::cerr << utility::timestamp("ERROR","ENCODER") << "Illegal word width (" << (int)cost.word_width << ")... " << std::endl; - return false; + for(U32 i = 0; i < container.sizeWithoutCarryOver(); ++i){ + if(meta_entries[i].controller.gt_available == false) + continue; + + io::VcfGenotypeSummary gt_summary = container.GetGenotypeSummary(i, this->n_samples); + + yon_gt_assess assessed = this->Assess(container[i], gt_summary, permutation_array); + const uint8_t primitive = assessed.GetCheapestPrimitive(); + + meta_entries[i].controller.biallelic = (container[i]->n_allele == 2); + meta_entries[i].controller.diploid = (gt_summary.base_ploidy == 2); + meta_entries[i].controller.gt_mixed_phasing = gt_summary.mixed_phasing; + meta_entries[i].controller.gt_anyMissing = (gt_summary.n_missing != 0); + meta_entries[i].controller.gt_anyNA = (gt_summary.n_vector_end != 0); + meta_entries[i].controller.gt_phase = gt_summary.phase_if_uniform; + meta_entries[i].controller.mixed_ploidy = (gt_summary.n_vector_end != 0); + meta_entries[i].controller.gt_available = true; + meta_entries[i].n_base_ploidy = gt_summary.base_ploidy; + + if(container[i]->d.fmt[0].n == 2){ + if(container[i]->n_allele == 2 && gt_summary.n_vector_end == 0){ + uint64_t n_runs = 0; + switch(primitive){ + case(0): n_runs = this->EncodeDiploidBiallelic (container[i],gt_summary,permutation_array,block.base_containers[YON_BLK_GT_INT8]); break; + case(1): n_runs = this->EncodeDiploidBiallelic(container[i],gt_summary,permutation_array,block.base_containers[YON_BLK_GT_INT16]); break; + case(2): n_runs = 
this->EncodeDiploidBiallelic(container[i],gt_summary,permutation_array,block.base_containers[YON_BLK_GT_INT32]); break; + case(3): n_runs = this->EncodeDiploidBiallelic(container[i],gt_summary,permutation_array,block.base_containers[YON_BLK_GT_INT64]); break; + default: + std::cerr << "illegal primitive type" << std::endl; + return false; + } + + meta_entries[i].controller.gt_primtive_type = TACHYON_GT_PRIMITIVE_TYPE(primitive); + meta_entries[i].controller.gt_compression_type = YON_GT_RLE_DIPLOID_BIALLELIC; + block.base_containers[YON_BLK_GT_SUPPORT].Add((U32)n_runs); + ++block.base_containers[YON_BLK_GT_SUPPORT]; + + } else { + uint64_t n_runs = 0; + switch(primitive){ + case(0): n_runs = this->EncodeDiploidMultiAllelic (container[i],gt_summary,permutation_array,block.base_containers[YON_BLK_GT_S_INT8]); break; + case(1): n_runs = this->EncodeDiploidMultiAllelic(container[i],gt_summary,permutation_array,block.base_containers[YON_BLK_GT_S_INT16]); break; + case(2): n_runs = this->EncodeDiploidMultiAllelic(container[i],gt_summary,permutation_array,block.base_containers[YON_BLK_GT_S_INT32]); break; + case(3): n_runs = this->EncodeDiploidMultiAllelic(container[i],gt_summary,permutation_array,block.base_containers[YON_BLK_GT_S_INT64]); break; + default: + std::cerr << "illegal primitive type" << std::endl; + return false; + } + + meta_entries[i].controller.gt_primtive_type = TACHYON_GT_PRIMITIVE_TYPE(primitive); + meta_entries[i].controller.gt_compression_type = YON_GT_RLE_DIPLOID_NALLELIC; + block.base_containers[YON_BLK_GT_SUPPORT].Add((U32)n_runs); + ++block.base_containers[YON_BLK_GT_SUPPORT]; } - return true; } - // BCF style is cheaper else { - //std::cerr << "BCF-style cheaper" << std::endl; - - meta.controller.gt_compression_type = YON_GT_BCF_DIPLOID; - block.gt_support_data_container.Add((U32)this->n_samples); - ++block.gt_support_data_container; - - - U64 n_runs = this->n_samples; - if(bcf_entry.body->n_allele + 2 < 8){ - meta.controller.gt_primtive_type = 
YON_GT_BYTE; - this->EncodeDiploidBCF(bcf_entry, block.gt_simple8_container, n_runs, ppa); - ++block.gt_simple8_container; - ++this->stats_.diploid_bcf_counts[0]; - } - else if(bcf_entry.body->n_allele + 2 < 128){ - meta.controller.gt_primtive_type = YON_GT_U16; - this->EncodeDiploidBCF (bcf_entry, block.gt_simple16_container, n_runs, ppa); - ++block.gt_simple16_container; - ++this->stats_.diploid_bcf_counts[1]; - } - else if(bcf_entry.body->n_allele + 2 < 32768){ - meta.controller.gt_primtive_type = YON_GT_U32; - this->EncodeDiploidBCF (bcf_entry, block.gt_simple32_container, n_runs, ppa); - ++block.gt_simple32_container; - ++this->stats_.diploid_bcf_counts[2]; - } - else { - std::cerr << utility::timestamp("ERROR", "ENCODER") << - "Illegal number of alleles (" << bcf_entry.body->n_allele + 1 << "). " - "Format is limited to 32768..." << std::endl; + uint64_t n_runs = 0; + switch(primitive){ + case(0): n_runs = this->EncodeMultiploid (container[i],gt_summary,permutation_array,block.base_containers[YON_BLK_GT_N_INT8]); break; + case(1): n_runs = this->EncodeMultiploid(container[i],gt_summary,permutation_array,block.base_containers[YON_BLK_GT_N_INT16]); break; + case(2): n_runs = this->EncodeMultiploid(container[i],gt_summary,permutation_array,block.base_containers[YON_BLK_GT_N_INT32]); break; + case(3): n_runs = this->EncodeMultiploid(container[i],gt_summary,permutation_array,block.base_containers[YON_BLK_GT_N_INT64]); break; + default: + std::cerr << "illegal primitive type" << std::endl; return false; } - return true; - } - } - // temp - else { - std::cerr << "other bcf style: n_alleles: " << bcf_entry.body->n_allele << ",ploidy: " << bcf_entry.gt_support.ploidy << std::endl; - meta.controller.gt_compression_type = YON_GT_BCF_STYLE; - block.gt_support_data_container.Add((U32)this->n_samples*bcf_entry.gt_support.ploidy); - ++block.gt_support_data_container; - U64 n_runs = this->n_samples*bcf_entry.gt_support.ploidy; - - if(bcf_entry.body->n_allele + 2 < 8){ - 
this->EncodeBCFStyle(bcf_entry, block.gt_simple8_container, n_runs); - ++block.gt_simple8_container; - meta.controller.gt_primtive_type = YON_GT_BYTE; - ++this->stats_.bcf_counts[0]; - } - else if(bcf_entry.body->n_allele + 2 < 128){ - this->EncodeBCFStyle (bcf_entry, block.gt_simple16_container, n_runs); - ++block.gt_simple16_container; - meta.controller.gt_primtive_type = YON_GT_U16; - ++this->stats_.bcf_counts[1]; - } - else if(bcf_entry.body->n_allele + 2 < 32768){ - this->EncodeBCFStyle (bcf_entry, block.gt_simple32_container, n_runs); - ++block.gt_simple32_container; - meta.controller.gt_primtive_type = YON_GT_U32; - ++this->stats_.bcf_counts[2]; + meta_entries[i].controller.gt_primtive_type = TACHYON_GT_PRIMITIVE_TYPE(primitive); + meta_entries[i].controller.gt_compression_type = YON_GT_RLE_NPLOID; + block.base_containers[YON_BLK_GT_SUPPORT].Add((U32)n_runs); + ++block.base_containers[YON_BLK_GT_SUPPORT]; } - else { - std::cerr << utility::timestamp("ERROR", "ENCODER") << - "Illegal number of alleles (" << bcf_entry.body->n_allele + 1 << "). " - "Format is limited to 32768..." 
<< std::endl; - return false; - } - return true; } - return false; + return true; } -bool GenotypeEncoder::EncodeParallel(const bcf_reader_type& bcf_reader, - meta_type* meta_entries, - block_type& block, - const U32* const ppa, - const U32 n_threads) +yon_gt_assess GenotypeEncoder::Assess(const bcf1_t* entry, + const io::VcfGenotypeSummary& gt_summary, + const yon_gt_ppa& permutation_array) const { - GenotypeEncoderSlaveHelper* helpers = new GenotypeEncoderSlaveHelper[bcf_reader.size()]; - CalcSlave* slaves = new CalcSlave[n_threads]; - std::vector threads(n_threads); - for(U32 i = 0; i < n_threads; ++i) threads[i] = slaves[i].Start(*this, i, n_threads, bcf_reader, meta_entries, ppa, helpers); - for(U32 i = 0; i < n_threads; ++i) threads[i]->join(); - for(U32 i = 0; i < bcf_reader.size(); ++i) { - block += helpers[i]; - this->updateStatistics(helpers[i]); + // Special case of diploid record. + yon_gt_assess assessed; + if(entry->d.fmt[0].n == 2){ + if(entry->n_allele == 2 && gt_summary.n_vector_end == 0) + assessed = this->AssessDiploidBiallelic(entry,gt_summary,permutation_array); + else + assessed = this->AssessDiploidMultiAllelic(entry,gt_summary,permutation_array); } - - delete [] slaves; - delete [] helpers; - - return true; + // All other ploidy is assessed with this multiploid function. + else { + assessed = this->AssessMultiploid(entry,gt_summary,permutation_array); + } + return assessed; } -// Todo: pass thread with slave in it -bool GenotypeEncoder::EncodeParallel(const bcf_type& bcf_entry, - meta_type& meta, - const U32* const ppa, - GenotypeEncoderSlaveHelper& slave) const +yon_gt_assess GenotypeEncoder::AssessDiploidBiallelic(const bcf1_t* entry, + const io::VcfGenotypeSummary& gt_summary, + const yon_gt_ppa& permutation_array) const { - if(bcf_entry.body->n_allele + 1 >= 32768){ - std::cerr << utility::timestamp("ERROR", "ENCODER") << - "Illegal number of alleles (" << bcf_entry.body->n_allele + 1 << "). " - "Format is limited to 32768..." 
<< std::endl; - return false; - } + assert(entry->d.fmt[0].n == 2); + assert(entry->n_allele == 2); + const uint8_t base_ploidy = entry->d.fmt[0].n; + const uint8_t* gt = entry->d.fmt[0].p; + const uint32_t l_gt = entry->d.fmt[0].p_len; + assert(permutation_array.n_samples * base_ploidy == l_gt); + assert(base_ploidy * sizeof(uint8_t) * this->n_samples == l_gt); + + // Track all possible outcomes. + // 1: BYTE + Permuted + // 2: U16 + Permuted + // 3: U32 + Permuted + // 4: U64 + Permuted + // 5: BYTE + No permutation + // 6: U16 + No permutation + // 7: U32 + No permutation + // 8: U64 + No permutation + uint64_t n_runs[8]; // Number of runs. + uint64_t l_runs[8]; // Current run length. + for(U32 i = 0; i < 8; ++i) l_runs[i] = 1; + for(U32 i = 0; i < 8; ++i) n_runs[i] = 0; + + // 1 + hasMissing + hasMixedPhasing + const BYTE shift = gt_summary.n_missing ? 2 : 1; // 1-bits enough when no data missing {0,1}, 2-bits required when missing is available {0,1,2} + const BYTE add = gt_summary.mixed_phasing ? 
1 : 0; - assert(bcf_entry.body != nullptr); - - meta.controller.biallelic = bcf_entry.body->n_allele == 2; - meta.controller.diploid = bcf_entry.gt_support.ploidy == 2; - meta.controller.gt_mixed_phasing = bcf_entry.gt_support.mixedPhasing; - meta.controller.gt_anyMissing = bcf_entry.gt_support.hasMissing; - meta.controller.gt_anyNA = bcf_entry.gt_support.hasMissing; - meta.controller.gt_phase = bcf_entry.gt_support.phase; - meta.controller.mixed_ploidy = bcf_entry.gt_support.hasEOV; - meta.controller.gt_phase = bcf_entry.gt_support.phase; - - if(bcf_entry.hasGenotypes){ - meta.controller.gt_available = true; - } else { - meta.controller.gt_available = false; - return true; - } - - U32 start_capacity = this->n_samples * 2 / 10; - if(this->n_samples * 2 / 10 < 65536) start_capacity = 65536; - slave.container.buffer_data_uncompressed.resize(start_capacity); - - //GenotypeEncoderSlaveHelper slave(start_capacity); - - // Assess cost and encode - rle_helper_type cost; - if(meta.controller.biallelic && meta.controller.diploid && meta.controller.mixed_ploidy == false){ // Case diploid and biallelic - cost = this->assessDiploidRLEBiallelic(bcf_entry, ppa); - - slave.encoding_type = YON_GT_RLE_DIPLOID_BIALLELIC; - slave.n_runs = cost.n_runs; - - meta.controller.gt_compression_type = YON_GT_RLE_DIPLOID_BIALLELIC; - //block.gt_support_data_container.Add((U32)cost.n_runs); - //++block.gt_support_data_container; - - switch(cost.word_width){ - case 1: - this->EncodeDiploidRLEBiallelic(bcf_entry, slave.container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_BYTE; - slave.gt_primitive = YON_GT_BYTE; - //++block.gt_rle8_container; - //++this->stats_.rle_counts[0]; - break; - case 2: - this->EncodeDiploidRLEBiallelic(bcf_entry, slave.container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_U16; - slave.gt_primitive = YON_GT_U16; - //++block.gt_rle16_container; - //++this->stats_.rle_counts[1]; - break; - case 4: - this->EncodeDiploidRLEBiallelic(bcf_entry, 
slave.container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_U32; - slave.gt_primitive = YON_GT_U32; - //++block.gt_rle32_container; - //++this->stats_.rle_counts[2]; - break; - case 8: - this->EncodeDiploidRLEBiallelic(bcf_entry, slave.container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_U64; - slave.gt_primitive = YON_GT_U64; - //++block.gt_rle64_container; - //++this->stats_.rle_counts[3]; - break; - default: - std::cerr << utility::timestamp("ERROR","ENCODER") << "Illegal word width (" << (int)cost.word_width << ")... " << std::endl; - return false; + // Run limits + uint64_t limits[4]; + limits[0] = pow(2, 8*sizeof(BYTE) - (base_ploidy*shift + add)) - 1; + limits[1] = pow(2, 8*sizeof(U16) - (base_ploidy*shift + add)) - 1; + limits[2] = pow(2, 8*sizeof(U32) - (base_ploidy*shift + add)) - 1; + limits[3] = pow(2, 8*sizeof(U64) - (base_ploidy*shift + add)) - 1; + + U32 rle_current_ref = YON_PACK_GT_DIPLOID(gt[0], gt[1], shift, add); + U32 rle_ppa_current_ref = YON_PACK_GT_DIPLOID(gt[permutation_array[0] * sizeof(int8_t) * base_ploidy], + gt[permutation_array[0] * sizeof(int8_t) * base_ploidy + 1], + shift, add); + + // Keep track of the linear offset in the genotype + // data stream. The permuted offset is computed directly + // and does not need to be tracked. + uint32_t l_gt_offset = 2; + + // Iterate over all available samples. 
+ for(U32 i = 1; i < this->n_samples; ++i, l_gt_offset += 2){ + const uint8_t* gt_ppa_target = >[permutation_array[i] * sizeof(int8_t) * base_ploidy]; + U32 rle_current = YON_PACK_GT_DIPLOID(gt[l_gt_offset], gt[l_gt_offset + 1], shift, add); + U32 rle_ppa_current = YON_PACK_GT_DIPLOID(gt_ppa_target[0], gt_ppa_target[1], shift, add); + + if(rle_current != rle_current_ref){ + for(U32 k = 4; k < 8; ++k) ++n_runs[k]; + for(U32 k = 4; k < 8; ++k) l_runs[k] = 0; + rle_current_ref = rle_current; } - return true; - } - else if(meta.controller.diploid) { // Case diploid n-allelic OR have EOV values - cost = this->assessDiploidRLEnAllelic(bcf_entry, ppa); - - // BCF-style cost - U32 costBCFStyle = this->n_samples; // cost for BCF-style encoding - if(bcf_entry.body->n_allele + 2 < 8) costBCFStyle *= sizeof(SBYTE); - else if(bcf_entry.body->n_allele + 2 < 128) costBCFStyle *= sizeof(S16); - else if(bcf_entry.body->n_allele + 2 < 32768) costBCFStyle *= sizeof(S32); - - // RLE is cheaper - if(cost.word_width * cost.n_runs < costBCFStyle){ - meta.controller.gt_compression_type = YON_GT_RLE_DIPLOID_NALLELIC; - //block.gt_support_data_container.Add((U32)cost.n_runs); - - slave.encoding_type = YON_GT_RLE_DIPLOID_NALLELIC; - slave.n_runs = cost.n_runs; - - //++block.gt_support_data_container; - - switch(cost.word_width){ - case 1: - this->EncodeDiploidRLEnAllelic(bcf_entry, slave.container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_BYTE; - slave.gt_primitive = YON_GT_BYTE; - //++block.gt_simple8_container; - // ++this->stats_.rle_simple_counts[0]; - break; - case 2: - this->EncodeDiploidRLEnAllelic(bcf_entry, slave.container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_U16; - slave.gt_primitive = YON_GT_U16; - //++block.gt_simple16_container; - //++this->stats_.rle_simple_counts[1]; - break; - case 4: - this->EncodeDiploidRLEnAllelic(bcf_entry, slave.container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_U32; - slave.gt_primitive = YON_GT_U32; - 
//++block.gt_simple32_container; - //++this->stats_.rle_simple_counts[2]; - break; - case 8: - this->EncodeDiploidRLEnAllelic(bcf_entry, slave.container, ppa, cost); - meta.controller.gt_primtive_type = YON_GT_U64; - slave.gt_primitive = YON_GT_U64; - //++block.gt_simple64_container; - //++this->stats_.rle_simple_counts[3]; - break; - default: - std::cerr << utility::timestamp("ERROR","ENCODER") << "Illegal word width (" << (int)cost.word_width << ")... " << std::endl; - return false; - } - return true; + // Overflow: trigger a break + for(U32 k = 4; k < 8; ++k){ + if(l_runs[k] == limits[k-4]){ ++n_runs[k]; l_runs[k] = 0; } + ++l_runs[k]; } - // BCF style is cheaper - else { - //std::cerr << "BCF-style cheaper" << std::endl; - meta.controller.gt_compression_type = YON_GT_BCF_DIPLOID; - //block.gt_support_data_container.Add((U32)this->n_samples); - - slave.encoding_type = YON_GT_BCF_DIPLOID; - slave.n_runs = this->n_samples; + if(rle_ppa_current != rle_ppa_current_ref){ + for(U32 k = 0; k < 4; ++k) ++n_runs[k]; + for(U32 k = 0; k < 4; ++k) l_runs[k] = 0; + rle_ppa_current_ref = rle_ppa_current; + } - //++block.gt_support_data_container; + // Overflow: trigger a break + for(U32 k = 0; k < 4; ++k){ + if(l_runs[k] == limits[k]){ ++n_runs[k]; l_runs[k] = 0; } + ++l_runs[k]; + } + } + assert(l_gt_offset == l_gt); + for(U32 k = 0; k < 8; ++k) ++n_runs[k]; - U64 n_runs = this->n_samples; - if(bcf_entry.body->n_allele + 2 < 8){ - meta.controller.gt_primtive_type = YON_GT_BYTE; - slave.gt_primitive = YON_GT_BYTE; - this->EncodeDiploidBCF(bcf_entry, slave.container, n_runs, ppa); - //++block.gt_simple8_container; - //++this->stats_.diploid_bcf_counts[0]; - } - else if(bcf_entry.body->n_allele + 2 < 128){ - meta.controller.gt_primtive_type = YON_GT_U16; - slave.gt_primitive = YON_GT_U16; - this->EncodeDiploidBCF(bcf_entry, slave.container, n_runs, ppa); - //++block.gt_simple16_container; - //++this->stats_.diploid_bcf_counts[1]; - } - else if(bcf_entry.body->n_allele + 2 < 
32768){ - meta.controller.gt_primtive_type = YON_GT_U32; - slave.gt_primitive = YON_GT_U32; - this->EncodeDiploidBCF(bcf_entry, slave.container, n_runs, ppa); - //++block.gt_simple32_container; - //++this->stats_.diploid_bcf_counts[2]; - } - else { - std::cerr << utility::timestamp("ERROR", "ENCODER") << - "Illegal number of alleles (" << bcf_entry.body->n_allele + 2 << "). " - "Format is limited to 32768..." << std::endl; - return false; - } - return true; - } + yon_gt_assess sum; + for(U32 k = 0; k < 4; ++k){ + sum.n_runs[k] = n_runs[k]; + sum.n_cost[k] = n_runs[k]*(k+1); } - else { - //std::cerr << "other bcf style: n_alleles: " << bcf_entry.body->n_allele << ",ploidy: " << bcf_entry.gt_support.ploidy << std::endl; - meta.controller.gt_compression_type = YON_GT_BCF_STYLE; - //block.gt_support_data_container.Add((U32)this->n_samples*bcf_entry.gt_support.ploidy); + for(U32 k = 4; k < 8; ++k){ + sum.n_runs[k] = n_runs[k]; + sum.n_cost[k] = n_runs[k]*((k-4)+1); + } + sum.method = 0; + + /* + std::cout << entry->pos + 1 << "\tR"; + for(U32 i = 0; i < 4; ++i) + std::cout << "\t" << n_runs[i] << "\t" << n_runs[i]*(i+1); + for(U32 i = 4; i < 8; ++i) + std::cout << "\t" << n_runs[i] << "\t" << n_runs[i]*((i-4)+1); + std::cout << std::endl; + */ + + return sum; +} + +yon_gt_assess GenotypeEncoder::AssessDiploidMultiAllelic(const bcf1_t* entry, + const io::VcfGenotypeSummary& gt_summary, + const yon_gt_ppa& permutation_array) const +{ + assert(entry->d.fmt[0].n == 2); + const uint8_t base_ploidy = entry->d.fmt[0].n; + const uint8_t* gt = entry->d.fmt[0].p; + const uint32_t l_gt = entry->d.fmt[0].p_len; + assert(permutation_array.n_samples * base_ploidy == l_gt); + assert(base_ploidy * sizeof(uint8_t) * this->n_samples == l_gt); + + // Track all possible outcomes. 
+ // 1: BYTE + Permuted + // 2: U16 + Permuted + // 3: U32 + Permuted + // 4: U64 + Permuted + // 5: BYTE + No permutation + // 6: U16 + No permutation + // 7: U32 + No permutation + // 8: U64 + No permutation + int64_t n_runs[8]; // Number of runs. + int64_t l_runs[8]; // Current run length. + for(U32 i = 0; i < 8; ++i) l_runs[i] = 1; + for(U32 i = 0; i < 8; ++i) n_runs[i] = 0; - slave.encoding_type = YON_GT_BCF_STYLE; - slave.n_runs = this->n_samples*bcf_entry.gt_support.ploidy; + // Assess RLE cost + const BYTE shift = ceil(log2(entry->n_allele + 2 + 1)); + const BYTE add = gt_summary.mixed_phasing ? 1 : 0; - //++block.gt_support_data_container; + // Run limits + // Values set to signed integers as values can underflow if + // the do not fit in the word size. + // Ploidy*shift_size bits for alleles and 1 bit for phase information (if required) + // Cost: 2^(8*word_width - (ploidy*(n_alleles + has_missing + hasEOV + 1) + has_mixed_phasing)) + int64_t limits[4]; + limits[0] = pow(2, 8*sizeof(BYTE) - (base_ploidy*shift + add)) - 1; + limits[1] = pow(2, 8*sizeof(U16) - (base_ploidy*shift + add)) - 1; + limits[2] = pow(2, 8*sizeof(U32) - (base_ploidy*shift + add)) - 1; + limits[3] = pow(2, 8*sizeof(U64) - (base_ploidy*shift + add)) - 1; + bool banned_limit[4]; + if(limits[0] <= 0){ limits[0] = std::numeric_limits::max(); banned_limit[0] = true; } + if(limits[1] <= 0){ limits[1] = std::numeric_limits::max(); banned_limit[1] = true; } + if(limits[2] <= 0){ limits[2] = std::numeric_limits::max(); banned_limit[2] = true; } + + uint8_t gt_remap[256]; + memset(gt_remap, 256, 255); + for(U32 i = 0; i <= entry->n_allele; ++i){ + gt_remap[i << 1] = ((i+1) << 1); + gt_remap[(i << 1) + 1] = ((i+1) << 1) + 1; + } + gt_remap[0] = 0; + gt_remap[129] = 1; + + U32 rle_current_ref = YON_PACK_GT_DIPLOID_NALLELIC(gt_remap[gt[0]] >> 1, + gt_remap[gt[1]] >> 1, + shift, add, + gt_remap[gt[1]]); + + U32 rle_ppa_current_ref = YON_PACK_GT_DIPLOID_NALLELIC(gt_remap[gt[permutation_array[0] * 
sizeof(int8_t) * base_ploidy]] >> 1, + gt_remap[gt[permutation_array[0] * sizeof(int8_t) * base_ploidy + 1]] >> 1, + shift, add, + gt_remap[gt[permutation_array[0] * sizeof(int8_t) * base_ploidy + 1]]); + + assert(ceil(log2(gt_remap[gt[0]] >> 1)) <= shift); + assert(ceil(log2(gt_remap[gt[1]] >> 1)) <= shift); + assert(ceil(log2(gt_remap[gt[permutation_array[0] * sizeof(int8_t) * base_ploidy]] >> 1)) <= shift); + assert(ceil(log2(gt_remap[gt[permutation_array[0] * sizeof(int8_t) * base_ploidy + 1]] >> 1)) <= shift); + + // Keep track of the linear offset in the genotype + // data stream. The permuted offset is computed directly + // and does not need to be tracked. + uint32_t l_gt_offset = base_ploidy; + + //std::cerr << entry->pos + 1 << "\t" << (gt[0]>>1) << "|" << (gt[1]>>1); + + // Iterate over all available samples. + for(U32 i = 1; i < this->n_samples; ++i, l_gt_offset += base_ploidy){ + const uint8_t* gt_ppa_target = >[permutation_array[i] * sizeof(int8_t) * base_ploidy]; + U32 rle_current = YON_PACK_GT_DIPLOID_NALLELIC(gt_remap[gt[l_gt_offset]] >> 1, gt_remap[gt[l_gt_offset+1]] >> 1, shift, add, gt_remap[gt[1]]); + U32 rle_ppa_current = YON_PACK_GT_DIPLOID_NALLELIC(gt_remap[gt_ppa_target[0]] >> 1, + gt_remap[gt_ppa_target[1]] >> 1, + shift, add, + gt_remap[gt_ppa_target[1]]); + + assert(ceil(log2(gt_remap[gt[l_gt_offset]] >> 1)) <= shift); + assert(ceil(log2(gt_remap[gt[l_gt_offset+1]] >> 1)) <= shift); + assert(ceil(log2(gt_remap[gt_ppa_target[0]] >> 1)) <= shift); + assert(ceil(log2(gt_remap[gt_ppa_target[1]] >> 1)) <= shift); + + assert(gt[l_gt_offset] != 255); + assert(gt[l_gt_offset+1] != 255); + assert(gt_remap[gt_ppa_target[0]] != 255); + assert(gt_remap[gt_ppa_target[1]] != 255); + + if(rle_current != rle_current_ref){ + for(U32 k = 4; k < 8; ++k) ++n_runs[k]; + for(U32 k = 4; k < 8; ++k) l_runs[k] = 0; + rle_current_ref = rle_current; + } - U64 n_runs = this->n_samples*bcf_entry.gt_support.ploidy; + // Overflow: trigger a break + for(U32 k = 4; k < 
8; ++k){ + if(l_runs[k] == limits[k-4]){ ++n_runs[k]; l_runs[k] = 0; } + ++l_runs[k]; + } - if(bcf_entry.body->n_allele + 2 < 8){ - this->EncodeBCFStyle(bcf_entry, slave.container, n_runs); - //++block.gt_simple8_container; - meta.controller.gt_primtive_type = YON_GT_BYTE; - slave.gt_primitive = YON_GT_BYTE; - //++this->stats_.bcf_counts[0]; + if(rle_ppa_current != rle_ppa_current_ref){ + for(U32 k = 0; k < 4; ++k) ++n_runs[k]; + for(U32 k = 0; k < 4; ++k) l_runs[k] = 0; + rle_ppa_current_ref = rle_ppa_current; } - else if(bcf_entry.body->n_allele + 2 < 128){ - this->EncodeBCFStyle (bcf_entry, slave.container, n_runs); - //++block.gt_simple16_container; - meta.controller.gt_primtive_type = YON_GT_U16; - slave.gt_primitive = YON_GT_U16; - //++this->stats_.bcf_counts[1]; + + // Overflow: trigger a break + for(U32 k = 0; k < 4; ++k){ + if(l_runs[k] == limits[k]){ ++n_runs[k]; l_runs[k] = 0; } + ++l_runs[k]; } - else if(bcf_entry.body->n_allele + 2 < 32768){ - this->EncodeBCFStyle (bcf_entry, slave.container, n_runs); - //++block.gt_simple32_container; - meta.controller.gt_primtive_type = YON_GT_U32; - slave.gt_primitive = YON_GT_U32; - //++this->stats_.bcf_counts[2]; + } + assert(l_gt_offset == l_gt); + for(U32 k = 0; k < 8; ++k) ++n_runs[k]; + + yon_gt_assess sum; + for(U32 k = 0; k < 4; ++k){ + if(banned_limit[k]){ + sum.n_runs[k] = std::numeric_limits::max(); + sum.n_cost[k] = std::numeric_limits::max(); + } else { + sum.n_runs[k] = n_runs[k]; + sum.n_cost[k] = n_runs[k]*(k+1); } - else { - std::cerr << utility::timestamp("ERROR", "ENCODER") << - "Illegal number of alleles (" << bcf_entry.body->n_allele + 2 << "). " - "Format is limited to 32768..." 
<< std::endl; - return false; + } + for(U32 k = 4; k < 8; ++k){ + if(banned_limit[k-4]){ + sum.n_runs[k] = std::numeric_limits::max(); + sum.n_cost[k] = std::numeric_limits::max(); + } else { + sum.n_runs[k] = n_runs[k]; + sum.n_cost[k] = n_runs[k]*((k-4)+1); } - return true; } - return false; -} + sum.method = 1; -const GenotypeEncoder::rle_helper_type GenotypeEncoder::assessDiploidRLEBiallelic(const bcf_type& bcf_entry, const U32* const ppa) const{ - // Setup - const BYTE ploidy = 2; - const BYTE shift = bcf_entry.gt_support.hasMissing ? 2 : 1; // 1-bits enough when no data missing {0,1}, 2-bits required when missing is available {0,1,2} - const BYTE add = bcf_entry.gt_support.mixedPhasing ? 1 : 0; - U32 n_runs_byte = 0; U32 run_length_byte = 1; - U32 n_runs_u16 = 0; U32 run_length_u16 = 1; - U32 n_runs_u32 = 0; U32 run_length_u32 = 1; - U64 n_runs_u64 = 0; U64 run_length_u64 = 1; - // Run limits - const BYTE BYTE_limit = pow(2, 8*sizeof(BYTE) - (ploidy*shift + add)) - 1; - const U16 U16_limit = pow(2, 8*sizeof(U16) - (ploidy*shift + add)) - 1; - const U32 U32_limit = pow(2, 8*sizeof(U32) - (ploidy*shift + add)) - 1; - const U64 U64_limit = pow(2, 8*sizeof(U64) - (ploidy*shift + add)) - 1; - - // First ref - const char* const data = &bcf_entry.data[bcf_entry.formatID[0].l_offset]; - const BYTE& allele1_2 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[0]]); - const BYTE& allele2_2 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[0] + sizeof(BYTE)]); - U32 ref = YON_PACK_GT_DIPLOID(allele2_2, allele1_2, shift, add); - - // Cycle over GT values - U32 ppa_pos = 1; - for(U32 i = ploidy; i < this->n_samples * ploidy; i += ploidy){ - const BYTE& allele1 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[ppa_pos]]); - const BYTE& allele2 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[ppa_pos] + sizeof(BYTE)]); - U32 internal = YON_PACK_GT_DIPLOID(allele2, allele1, shift, add); - - // Extend or break run - if(ref != internal){ - ++n_runs_byte; run_length_byte = 0; - 
++n_runs_u16; run_length_u16 = 0; - ++n_runs_u32; run_length_u32 = 0; - ++n_runs_u64; run_length_u64 = 0; - ref = internal; - } + /* + std::cout << entry->pos + 1 << "\tM"; + for(U32 i = 0; i < 8; ++i) + std::cout << "\t" << sum.n_runs[i] << "\t" << sum.n_cost[i]; + std::cout << std::endl; + */ - // Overflow: trigger a break - if(run_length_byte == BYTE_limit){ ++n_runs_byte; run_length_byte = 0; } - if(run_length_u16 == U16_limit) { ++n_runs_u16; run_length_u16 = 0; } - if(run_length_u32 == U32_limit) { ++n_runs_u32; run_length_u32 = 0; } - if(run_length_u64 == U64_limit) { ++n_runs_u64; run_length_u64 = 0; } - - // Update all counts - ++run_length_byte; - ++run_length_u16; - ++run_length_u32; - ++run_length_u64; - ++ppa_pos; - } - // Final runs - ++n_runs_byte; - ++n_runs_u16; - ++n_runs_u32; - ++n_runs_u64; - - // Determine best action - U32 smallest_cost = n_runs_byte*sizeof(BYTE); - U64 chosen_runs = n_runs_byte; - BYTE word_width = sizeof(BYTE); - if(n_runs_u16*sizeof(U16) < smallest_cost){ smallest_cost = n_runs_u16*sizeof(U16); word_width = sizeof(U16); chosen_runs = n_runs_u16; } - if(n_runs_u32*sizeof(U32) < smallest_cost){ smallest_cost = n_runs_u32*sizeof(U32); word_width = sizeof(U32); chosen_runs = n_runs_u32; } - if(n_runs_u64*sizeof(U64) < smallest_cost){ smallest_cost = n_runs_u64*sizeof(U64); word_width = sizeof(U64); chosen_runs = n_runs_u64; } - - assert(ppa_pos == n_samples); - return(rle_helper_type(word_width, chosen_runs)); + return sum; } -const GenotypeEncoder::rle_helper_type GenotypeEncoder::assessDiploidRLEnAllelic(const bcf_type& bcf_entry, const U32* const ppa) const{ - const BYTE ploidy = 2; +yon_gt_assess GenotypeEncoder::AssessMultiploid(const bcf1_t* entry, + const io::VcfGenotypeSummary& gt_summary, + const yon_gt_ppa& permutation_array) const +{ + const uint8_t base_ploidy = entry->d.fmt[0].n; + const uint8_t* gt = entry->d.fmt[0].p; + const uint32_t l_gt = entry->d.fmt[0].p_len; + assert(permutation_array.n_samples * 
base_ploidy == l_gt); + assert(base_ploidy * sizeof(uint8_t) * this->n_samples == l_gt); + + uint64_t n_runs[8]; // Number of runs. + uint64_t l_runs[8]; // Current run length. + for(U32 i = 0; i < 8; ++i) l_runs[i] = 1; + for(U32 i = 0; i < 8; ++i) n_runs[i] = 0; + uint64_t limits[4]; + limits[0] = std::numeric_limits::max(); + limits[1] = std::numeric_limits::max(); + limits[2] = std::numeric_limits::max(); + limits[3] = std::numeric_limits::max(); + + //std::cerr << entry->pos + 1 << " : " << this->n_samples << "\t"; + uint64_t hash_value_ref = XXH64(>[0], sizeof(int8_t) * base_ploidy, 89231478); + uint64_t hash_value_ppa_ref = XXH64(>[permutation_array[0] * sizeof(int8_t) * base_ploidy], sizeof(int8_t) * base_ploidy, 89231478); + + uint32_t l_gt_offset = base_ploidy; + + // Iterate over all available samples. + for(U32 i = 1; i < this->n_samples; ++i, l_gt_offset += base_ploidy){ + const uint8_t* gt_ppa_target = >[permutation_array[i] * sizeof(int8_t) * base_ploidy]; + uint64_t hash_value = XXH64(>[l_gt_offset], sizeof(int8_t) * base_ploidy, 89231478); + uint64_t hash_value_ppa = XXH64(gt_ppa_target, sizeof(int8_t) * base_ploidy, 89231478); + + if(hash_value_ppa != hash_value_ppa_ref){ + for(U32 k = 0; k < 4; ++k) ++n_runs[k]; + for(U32 k = 0; k < 4; ++k) l_runs[k] = 0; + hash_value_ppa_ref = hash_value_ppa; + } - // Assess RLE cost - const BYTE shift = ceil(log2(bcf_entry.body->n_allele + 2 + 1)); - const BYTE add = bcf_entry.gt_support.mixedPhasing ? 
1 : 0; + // Overflow: trigger a break + for(U32 k = 0; k < 4; ++k){ + if(l_runs[k] == limits[k]){ ++n_runs[k]; l_runs[k] = 0; } + ++l_runs[k]; + } - // Run limits - // Values set to signed integers as values can underflow if - // the do not fit in the word size - // Ploidy*shift_size bits for alleles and 1 bit for phase information (if required) - // Cost: 2^(8*word_width - (ploidy*(n_alleles + has_missing + hasEOV + 1) + has_mixed_phasing)) - S32 BYTE_limit = pow(2, 8*sizeof(BYTE) - (ploidy*shift + add)) - 1; - S32 U16_limit = pow(2, 8*sizeof(U16) - (ploidy*shift + add)) - 1; - S64 U32_limit = pow(2, 8*sizeof(U32) - (ploidy*shift + add)) - 1; - U64 U64_limit = pow(2, 8*sizeof(U64) - (ploidy*shift + add)) - 1; - if(BYTE_limit <= 0) BYTE_limit = std::numeric_limits::max(); - if(U16_limit <= 0) U16_limit = std::numeric_limits::max(); - if(U32_limit <= 0) U32_limit = std::numeric_limits::max(); - - U32 n_runs_byte = 0; U32 run_length_byte = 1; - U32 n_runs_u16 = 0; U32 run_length_u16 = 1; - U32 n_runs_u32 = 0; U32 run_length_u32 = 1; - U64 n_runs_u64 = 0; U64 run_length_u64 = 1; - - // Setup first - const char* const data = &bcf_entry.data[bcf_entry.formatID[0].l_offset]; - BYTE allele1 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[0]]); - BYTE allele2 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[0] + sizeof(BYTE)]); - const bool phase = allele2 & 1; - if((allele1 >> 1) == 0) allele1 = 0; - else if(allele1 == 0x81) allele1 = 1; - else allele1 = (allele1 >> 1) + 1; - - if((allele2 >> 1) == 0) allele2 = 0; - else if(allele2 == 0x81) allele2 = 1; - else allele2 = (allele2 >> 1) + 1; - U32 ref = YON_PACK_GT_DIPLOID_NALLELIC(allele2, allele1, shift, add, phase); - - U32 ppa_pos = 1; - for(U32 i = ploidy; i < this->n_samples * ploidy; i += ploidy){ - BYTE allele1 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[ppa_pos]]); - BYTE allele2 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[ppa_pos] + sizeof(BYTE)]); - const bool phase = allele2 & 1; - - if((allele1 
>> 1) == 0) allele1 = 0; - else if(allele1 == 0x81) allele1 = 1; - else allele1 = (allele1 >> 1) + 1; - - if((allele2 >> 1) == 0) allele2 = 0; - else if(allele2 == 0x81) allele2 = 1; - else allele2 = (allele2 >> 1) + 1; - - const U32 internal = YON_PACK_GT_DIPLOID_NALLELIC(allele2, allele1, shift, add, phase); - - if(ref != internal){ - ref = internal; - ++n_runs_byte; run_length_byte = 0; - ++n_runs_u16; run_length_u16 = 0; - ++n_runs_u32; run_length_u32 = 0; - ++n_runs_u64; run_length_u64 = 0; + if(hash_value != hash_value_ref){ + for(U32 k = 4; k < 8; ++k) ++n_runs[k]; + for(U32 k = 4; k < 8; ++k) l_runs[k] = 0; + hash_value_ref = hash_value; } // Overflow: trigger a break - if(run_length_byte == BYTE_limit){ ++n_runs_byte; run_length_byte = 0; } - if(run_length_u16 == U16_limit) { ++n_runs_u16; run_length_u16 = 0; } - if(run_length_u32 == U32_limit) { ++n_runs_u32; run_length_u32 = 0; } - if(run_length_u64 == U64_limit) { ++n_runs_u64; run_length_u64 = 0; } - - // Update all counts - ++run_length_byte; - ++run_length_u16; - ++run_length_u32; - ++run_length_u64; - ++ppa_pos; + for(U32 k = 4; k < 8; ++k){ + if(l_runs[k] == limits[k-4]){ ++n_runs[k]; l_runs[k] = 0; } + ++l_runs[k]; + } + } + assert(l_gt_offset == l_gt); + for(U32 k = 0; k < 8; ++k) ++n_runs[k]; + + yon_gt_assess sum; + for(U32 k = 0; k < 4; ++k){ + sum.n_runs[k] = n_runs[k]; + sum.n_cost[k] = n_runs[k]*(k+1); + } + for(U32 k = 4; k < 8; ++k){ + sum.n_runs[k] = n_runs[k]; + sum.n_cost[k] = n_runs[k]*((k-4)+1); } - // Final runs - ++n_runs_byte; - ++n_runs_u16; - ++n_runs_u32; - ++n_runs_u64; - - // Determine best action - U32 smallest_cost = n_runs_byte*sizeof(BYTE); - U64 chosen_runs = n_runs_byte; - BYTE word_width = 1; - if(BYTE_limit == std::numeric_limits::max()) smallest_cost = std::numeric_limits::max(); - if(n_runs_u16*sizeof(U16) < smallest_cost){ smallest_cost = n_runs_u16*sizeof(U16); word_width = sizeof(U16); chosen_runs = n_runs_u16; } - if(n_runs_u32*sizeof(U32) < smallest_cost){ 
smallest_cost = n_runs_u32*sizeof(U32); word_width = sizeof(U32); chosen_runs = n_runs_u32; } - if(n_runs_u64*sizeof(U64) < smallest_cost){ smallest_cost = n_runs_u64*sizeof(U64); word_width = sizeof(U64); chosen_runs = n_runs_u64; } - - assert(ppa_pos == n_samples); - return(rle_helper_type(word_width, chosen_runs)); + sum.method = 2; + + /* + std::cout << entry->pos + 1 << "\tX\t" << this->n_samples; + for(U32 i = 0; i < 4; ++i) + std::cout << "\t" << n_runs[i] << "\t" << n_runs[i]*(i+1)*base_ploidy; + for(U32 i = 4; i < 8; ++i) + std::cout << "\t" << n_runs[i] << "\t" << n_runs[i]*((i-4)+1)*base_ploidy; + std::cout << std::endl; + */ + + return sum; } void GenotypeEncoder::updateStatistics(const GenotypeEncoderSlaveHelper& helper){ diff --git a/lib/algorithm/compression/genotype_encoder.h b/lib/algorithm/compression/genotype_encoder.h index 938b0b0..1a2a085 100644 --- a/lib/algorithm/compression/genotype_encoder.h +++ b/lib/algorithm/compression/genotype_encoder.h @@ -6,22 +6,50 @@ #include #include -#include "core/genotype_summary.h" -#include "io/bcf/BCFReader.h" #include "containers/variant_block.h" #include "core/variant_controller.h" +#include "core/genotypes.h" +#include "io/vcf_utils.h" +#include "containers/vcf_container.h" namespace tachyon{ namespace algorithm{ +const BYTE BCF_UNPACK_TACHYON[3] = {2, 0, 1}; +#define BCF_UNPACK_GENOTYPE(A) BCF_UNPACK_TACHYON[((A) >> 1)] +const char BCF_TYPE_SIZE[8] = {0,1,2,4,0,4,0,1}; + #define ENCODER_GT_DEBUG 0 -#define YON_PACK_GT_DIPLOID(A, B, SHIFT, ADD) (bcf::BCF_UNPACK_GENOTYPE(A) << ((SHIFT) + (ADD))) | (bcf::BCF_UNPACK_GENOTYPE(B) << (ADD)) | ((A) & (ADD)) +#define YON_PACK_GT_DIPLOID(A, B, SHIFT, ADD) (BCF_UNPACK_GENOTYPE(A) << ((SHIFT) + (ADD))) | (BCF_UNPACK_GENOTYPE(B) << (ADD)) | ((A) & (ADD)) #define YON_PACK_GT_DIPLOID_NALLELIC(A, B, SHIFT, ADD, PHASE) ((A) << ((SHIFT) + (ADD))) | ((B) << (ADD)) | ((PHASE) & (ADD)) +struct yon_gt_assess { + uint8_t GetCheapestPrimitive(void) const{ + uint64_t 
n_cost_best = this->n_cost[0]; + uint8_t best_primitive = 0; + for(int i = 1; i < 4; ++i){ + if(this->n_cost[i] < n_cost_best){ + n_cost_best = this->n_cost[i]; + best_primitive = i; + } + } + return(best_primitive); + } + + uint8_t method; + uint64_t n_runs[8]; + uint64_t n_cost[8]; +}; + struct GenotypeEncoderStatistics{ - GenotypeEncoderStatistics(){} + GenotypeEncoderStatistics(){ + memset(this->rle_counts, 0, sizeof(U64)*4); + memset(this->rle_simple_counts, 0, sizeof(U64)*4); + memset(this->diploid_bcf_counts, 0, sizeof(U64)*3); + memset(this->bcf_counts, 0, sizeof(U64)*3); + } - const U64 getTotal(void) const{ + U64 getTotal(void) const{ U64 total = 0; for(U32 i = 0; i < 4; ++i) total += this->rle_counts[i]; for(U32 i = 0; i < 4; ++i) total += this->rle_simple_counts[i]; @@ -65,58 +93,58 @@ struct GenotypeEncoderSlaveHelper{ // Overload operator += for block and RTYPE helper friend block_type& operator+=(block_type& block, const self_type& helper){ - block.gt_support_data_container.Add((U32)helper.n_runs); - ++block.gt_support_data_container; + block.base_containers[YON_BLK_GT_SUPPORT].Add((U32)helper.n_runs); + ++block.base_containers[YON_BLK_GT_SUPPORT]; if(helper.encoding_type == YON_GT_RLE_DIPLOID_BIALLELIC){ if(helper.gt_primitive == YON_GT_BYTE){ - block.gt_rle8_container += helper.container; - ++block.gt_rle8_container; + block.base_containers[YON_BLK_GT_INT8] += helper.container; + ++block.base_containers[YON_BLK_GT_INT8]; } else if(helper.gt_primitive == YON_GT_U16){ - block.gt_rle16_container += helper.container; - ++block.gt_rle16_container; + block.base_containers[YON_BLK_GT_INT16] += helper.container; + ++block.base_containers[YON_BLK_GT_INT16]; } else if(helper.gt_primitive == YON_GT_U32){ - block.gt_rle32_container += helper.container; - ++block.gt_rle32_container; + block.base_containers[YON_BLK_GT_INT32] += helper.container; + ++block.base_containers[YON_BLK_GT_INT32]; } else if(helper.gt_primitive == YON_GT_U64){ - block.gt_rle64_container 
+= helper.container; - ++block.gt_rle64_container; + block.base_containers[YON_BLK_GT_INT64] += helper.container; + ++block.base_containers[YON_BLK_GT_INT64]; } } else if(helper.encoding_type == YON_GT_RLE_DIPLOID_NALLELIC){ if(helper.gt_primitive == YON_GT_BYTE){ - block.gt_simple8_container += helper.container; - ++block.gt_simple8_container; + block.base_containers[YON_BLK_GT_S_INT8] += helper.container; + ++block.base_containers[YON_BLK_GT_S_INT8]; } else if(helper.gt_primitive == YON_GT_U16){ - block.gt_simple16_container += helper.container; - ++block.gt_simple16_container; + block.base_containers[YON_BLK_GT_S_INT16] += helper.container; + ++block.base_containers[YON_BLK_GT_S_INT16]; } else if(helper.gt_primitive == YON_GT_U32){ - block.gt_simple32_container += helper.container; - ++block.gt_simple32_container; + block.base_containers[YON_BLK_GT_S_INT32] += helper.container; + ++block.base_containers[YON_BLK_GT_S_INT32]; } else if(helper.gt_primitive == YON_GT_U64){ - block.gt_simple64_container += helper.container; - ++block.gt_simple64_container; + block.base_containers[YON_BLK_GT_S_INT64] += helper.container; + ++block.base_containers[YON_BLK_GT_S_INT64]; } } else if(helper.encoding_type == YON_GT_BCF_DIPLOID){ if(helper.gt_primitive == YON_GT_BYTE){ - block.gt_simple8_container += helper.container; - ++block.gt_simple8_container; + block.base_containers[YON_BLK_GT_S_INT8] += helper.container; + ++block.base_containers[YON_BLK_GT_S_INT8]; } else if(helper.gt_primitive == YON_GT_U16){ - block.gt_simple16_container += helper.container; - ++block.gt_simple16_container; + block.base_containers[YON_BLK_GT_S_INT16] += helper.container; + ++block.base_containers[YON_BLK_GT_S_INT16]; } else if(helper.gt_primitive == YON_GT_U32){ - block.gt_simple32_container += helper.container; - ++block.gt_simple32_container; + block.base_containers[YON_BLK_GT_S_INT32] += helper.container; + ++block.base_containers[YON_BLK_GT_S_INT32]; } } else if(helper.encoding_type == 
YON_GT_BCF_STYLE){ if(helper.gt_primitive == YON_GT_BYTE){ - block.gt_simple8_container += helper.container; - ++block.gt_simple8_container; + block.base_containers[YON_BLK_GT_S_INT8] += helper.container; + ++block.base_containers[YON_BLK_GT_S_INT8]; } else if(helper.gt_primitive == YON_GT_U16){ - block.gt_simple16_container += helper.container; - ++block.gt_simple16_container; + block.base_containers[YON_BLK_GT_S_INT16] += helper.container; + ++block.base_containers[YON_BLK_GT_S_INT16]; } else if(helper.gt_primitive == YON_GT_U32){ - block.gt_simple32_container += helper.container; - ++block.gt_simple32_container; + block.base_containers[YON_BLK_GT_S_INT32] += helper.container; + ++block.base_containers[YON_BLK_GT_S_INT32]; } } @@ -131,11 +159,9 @@ struct GenotypeEncoderSlaveHelper{ }; class GenotypeEncoder { -private: +public: typedef GenotypeEncoder self_type; typedef io::BasicBuffer buffer_type; - typedef bcf::BCFReader bcf_reader_type; - typedef bcf::BCFEntry bcf_type; typedef core::MetaEntry meta_type; typedef containers::DataContainer container_type; typedef containers::VariantBlock block_type; @@ -162,25 +188,37 @@ class GenotypeEncoder { GenotypeEncoder(); GenotypeEncoder(const U64 samples); ~GenotypeEncoder(); - bool Encode(const bcf_type& bcf_entry, meta_type& meta, block_type& block, const U32* const ppa); - bool EncodeParallel(const bcf_reader_type& bcf_reader, meta_type* meta_entries, block_type& block, const U32* const ppa, const U32 n_threads); - bool EncodeParallel(const bcf_type& bcf_entry, meta_type& meta, const U32* const ppa, GenotypeEncoderSlaveHelper& slave_helper) const; - inline void setSamples(const U64 samples){ this->n_samples = samples; } - inline const stats_type& getUsageStats(void) const{ return(this->stats_); } -private: - const rle_helper_type assessDiploidRLEBiallelic(const bcf_type& bcf_entry, const U32* const ppa) const; - const rle_helper_type assessDiploidRLEnAllelic(const bcf_type& bcf_entry, const U32* const ppa) const; - 
const rle_helper_type assessMploidRLEBiallelic(const bcf_type& bcf_entry, const U32* const ppa) const; - const rle_helper_type assessMploidRLEnAllelic(const bcf_type& bcf_entry, const U32* const ppa) const; - - template bool EncodeBCFStyle(const bcf_type& bcf_entry, container_type& container, U64& n_runs) const; - template bool EncodeDiploidBCF(const bcf_type& bcf_entry, container_type& runs, U64& n_runs, const U32* const ppa) const; - template bool EncodeDiploidRLEBiallelic(const bcf_type& bcf_entry, container_type& runs, const U32* const ppa, const rle_helper_type& helper) const; - template bool EncodeDiploidRLEnAllelic(const bcf_type& bcf_entry, container_type& runs, const U32* const ppa, const rle_helper_type& helper) const; - template bool EncodeMploidRLEBiallelic(const bcf_type& bcf_entry, container_type& runs, U64& n_runs, const U32* const ppa) const; - template bool EncodeMploidRLENallelic(const bcf_type& bcf_entry, container_type& runs, U64& n_runs, const U32* const ppa) const; + inline void SetSamples(const U64 samples){ this->n_samples = samples; } + inline const stats_type& GetUsageStats(void) const{ return(this->stats_); } + + bool Encode(const containers::VcfContainer& container, meta_type* meta_entries, block_type& block, const yon_gt_ppa& permutation_array) const; + bool EncodeParallel(const containers::VcfContainer& container, meta_type* meta_entries, GenotypeEncoderSlaveHelper& slave_helper, const yon_gt_ppa& permutation_array) const; + + yon_gt_assess Assess(const bcf1_t* entry, const io::VcfGenotypeSummary& gt_summary, const yon_gt_ppa& permutation_array) const; + yon_gt_assess AssessDiploidBiallelic(const bcf1_t* entry, const io::VcfGenotypeSummary& gt_summary, const yon_gt_ppa& permutation_array) const; + yon_gt_assess AssessDiploidMultiAllelic(const bcf1_t* entry, const io::VcfGenotypeSummary& gt_summary, const yon_gt_ppa& permutation_array) const; + yon_gt_assess AssessMultiploid(const bcf1_t* entry, const io::VcfGenotypeSummary& gt_summary, 
const yon_gt_ppa& permutation_array) const; + + template + uint64_t EncodeDiploidBiallelic(const bcf1_t* entry, + const io::VcfGenotypeSummary& gt_summary, + const yon_gt_ppa& permutation_array, + container_type& dst) const; + + template + uint64_t EncodeDiploidMultiAllelic(const bcf1_t* entry, + const io::VcfGenotypeSummary& gt_summary, + const yon_gt_ppa& permutation_array, + container_type& dst) const; + template + uint64_t EncodeMultiploid(const bcf1_t* entry, + const io::VcfGenotypeSummary& gt_summary, + const yon_gt_ppa& permutation_array, + container_type& dst) const; + +private: /**< * Supportive reduce function for updating local import statistics * following parallel execution of `EncodeParallel`. Iteratively @@ -195,244 +233,248 @@ class GenotypeEncoder { stats_type stats_; }; -template -bool GenotypeEncoder::EncodeBCFStyle(const bcf_type& bcf_entry, - container_type& simple, - U64& n_runs) const +template +uint64_t GenotypeEncoder::EncodeDiploidBiallelic(const bcf1_t* entry, + const io::VcfGenotypeSummary& gt_summary, + const yon_gt_ppa& permutation_array, + container_type& dst) const { - const BYTE ploidy = bcf_entry.gt_support.ploidy; - U32 bcf_gt_pos = bcf_entry.formatID[0].l_offset; - const BCF_GT_TYPE missing_value = (BCF_GT_TYPE)1 << (sizeof(BCF_GT_TYPE)*8 - 1); - const BCF_GT_TYPE EOV_value = missing_value + 1; - - // Pack genotypes as - // allele | phasing - U32 j = 0; - for(U32 i = 0; i < this->n_samples * ploidy; i += ploidy, ++j){ - for(U32 p = 0; p < ploidy; ++p){ - const BCF_GT_TYPE& allele = *reinterpret_cast(&bcf_entry.data[bcf_gt_pos]); - if((allele >> 1) == 0) simple.AddLiteral((YON_STORE_TYPE)0); // missing - else if(allele == EOV_value) simple.AddLiteral((YON_STORE_TYPE)1); // eov - else { // otherwise - // Add 1 because 1 is reserved for EOV - const YON_STORE_TYPE val = ((allele >> 1) + 1) << 1 | (allele & 1); - simple.AddLiteral((YON_STORE_TYPE)val); - } - bcf_gt_pos += sizeof(BCF_GT_TYPE); - } - } + assert(entry->d.fmt[0].n == 2); 
+ assert(entry->n_allele == 2); + assert(gt_summary.n_vector_end == 0); + const uint8_t base_ploidy = entry->d.fmt[0].n; + const uint8_t* gt = entry->d.fmt[0].p; + const uint32_t l_gt = entry->d.fmt[0].p_len; + assert(permutation_array.n_samples * base_ploidy == l_gt); + assert(base_ploidy * sizeof(uint8_t) * this->n_samples == l_gt); + + uint64_t n_runs = 0; // Number of runs. + uint64_t l_runs = 1; // Current run length. + + // 1 + hasMissing + hasMixedPhasing + const BYTE shift = gt_summary.n_missing ? 2 : 1; // 1-bits enough when no data missing {0,1}, 2-bits required when missing is available {0,1,2} + const BYTE add = gt_summary.mixed_phasing ? 1 : 0; - n_runs = this->n_samples*ploidy; - simple.header.n_additions += n_runs; + // Run limits + const uint64_t limit = pow(2, 8*sizeof(YON_RLE_TYPE) - (base_ploidy*shift + add)) - 1; + U32 rle_ppa_current_ref = YON_PACK_GT_DIPLOID(gt[permutation_array[0] * sizeof(int8_t) * base_ploidy], + gt[permutation_array[0] * sizeof(int8_t) * base_ploidy + 1], + shift, add); + + // Iterate over all available samples. 
+ for(U32 i = 1; i < this->n_samples; ++i){ + const uint8_t* gt_ppa_target = >[permutation_array[i] * sizeof(int8_t) * base_ploidy]; + U32 rle_ppa_current = YON_PACK_GT_DIPLOID(gt_ppa_target[0], gt_ppa_target[1], shift, add); + + if(rle_ppa_current != rle_ppa_current_ref || l_runs == limit){ + YON_RLE_TYPE RLE = l_runs; + RLE <<= (base_ploidy*shift + add); + RLE |= rle_ppa_current_ref; + assert((RLE >> (base_ploidy*shift + add)) == l_runs); - return(true); -} + // Push RLE to buffer + dst.AddLiteral((YON_RLE_TYPE)RLE); + ++dst.header.n_additions; -template -bool GenotypeEncoder::EncodeDiploidBCF(const bcf_type& bcf_entry, - container_type& simple, - U64& n_runs, - const U32* const ppa) const -{ - const BYTE ploidy = 2; - // Shift size is equivalent to floor((sizeof(T)*8 - 1)/2) - const BYTE shift_size = (sizeof(YON_RLE_TYPE)*8 - 1) / 2; - - // Start of GT byte stream - const char* const data = &bcf_entry.data[bcf_entry.formatID[0].l_offset]; - - const BCF_GT_TYPE missing_value = (BCF_GT_TYPE)1 << (sizeof(BCF_GT_TYPE)*8 - 1); - const BCF_GT_TYPE EOV_value = missing_value + 1; - // Pack genotypes as - // allele A | allele B | phasing information - U32 ppa_pos = 0; - YON_RLE_TYPE temp = 0; - for(U32 i = 0; i < this->n_samples * ploidy; i += ploidy){ - BCF_GT_TYPE allele1 = *reinterpret_cast(&data[ploidy*sizeof(BCF_GT_TYPE)*ppa[ppa_pos]]); - BCF_GT_TYPE allele2 = *reinterpret_cast(&data[ploidy*sizeof(BCF_GT_TYPE)*ppa[ppa_pos] + sizeof(BCF_GT_TYPE)]); - const bool phasing = allele2 & 1; - - if((allele1 >> 1) == 0) allele1 = 0; - else if(allele1 == EOV_value) allele1 = 1; - else allele1 = (allele1 >> 1) + 1; - if((allele2 >> 1) == 0) allele2 = 0; - else if(allele2 == EOV_value) allele2 = 1; - else allele2 = (allele2 >> 1) + 1; - - const YON_RLE_TYPE packed = (allele1 << (shift_size + 1)) | - (allele2 << 1) | - (phasing & 1); - - simple.AddLiteral((YON_RLE_TYPE)packed); - ++ppa_pos; + rle_ppa_current_ref = rle_ppa_current; + + l_runs = 0; + ++n_runs; + } + ++l_runs; } + 
++n_runs; - n_runs = this->n_samples; - simple.header.n_additions += n_runs; + YON_RLE_TYPE RLE = l_runs; + RLE <<= (base_ploidy*shift + add); + RLE |= rle_ppa_current_ref; + assert((RLE >> (base_ploidy*shift + add)) == l_runs); - return(true); + // Push RLE to buffer + dst.AddLiteral((YON_RLE_TYPE)RLE); + ++dst.header.n_additions; + ++dst.header.n_entries; + + //std::cerr << n_runs << "\t" << dst.buffer_data_uncompressed.size() << std::endl; + + return n_runs; } template -bool GenotypeEncoder::EncodeDiploidRLEBiallelic(const bcf_type& bcf_entry, - container_type& runs, - const U32* const ppa, - const rle_helper_type& helper) const +uint64_t GenotypeEncoder::EncodeDiploidMultiAllelic(const bcf1_t* entry, + const io::VcfGenotypeSummary& gt_summary, + const yon_gt_ppa& permutation_array, + container_type& dst) const { - const BYTE ploidy = 2; - U32 sumLength = 0; - YON_RLE_TYPE length = 1; - YON_RLE_TYPE RLE = 0; - const BYTE shift = bcf_entry.gt_support.hasMissing ? 2 : 1; - const BYTE add = bcf_entry.gt_support.mixedPhasing ? 1 : 0; + assert(entry->d.fmt[0].n == 2); + const uint8_t base_ploidy = entry->d.fmt[0].n; + const uint8_t* gt = entry->d.fmt[0].p; + const uint32_t l_gt = entry->d.fmt[0].p_len; + assert(permutation_array.n_samples * base_ploidy == l_gt); + assert(base_ploidy * sizeof(uint8_t) * this->n_samples == l_gt); + + int64_t n_runs = 0; // Number of runs. + int64_t l_runs = 1; // Current run length. + + // Assess RLE cost + const BYTE shift = ceil(log2(entry->n_allele + 2 + 1)); + const BYTE add = gt_summary.mixed_phasing ? 
1 : 0; // Run limits - const YON_RLE_TYPE run_limit = pow(2, 8*sizeof(YON_RLE_TYPE) - (ploidy*shift + add)) - 1; - const char* const data = &bcf_entry.data[bcf_entry.formatID[0].l_offset]; - const BYTE& allele1 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[0]]); - const BYTE& allele2 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[0] + sizeof(BYTE)]); - YON_RLE_TYPE packed = YON_PACK_GT_DIPLOID(allele2, allele1, shift, add); - - U32 ppa_pos = 1; - U64 n_runs = 0; - for(U32 i = ploidy; i < this->n_samples * ploidy; i += ploidy){ - const BYTE& allele1 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[ppa_pos]]); - const BYTE& allele2 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[ppa_pos] + sizeof(BYTE)]); - const YON_RLE_TYPE packed_internal = YON_PACK_GT_DIPLOID(allele2, allele1, shift, add); - - if(packed != packed_internal || length == run_limit){ - // Prepare RLE - RLE = length; - RLE <<= (ploidy*shift + add); - RLE |= packed; - assert((RLE >> (ploidy*shift + add)) == length); + // Values set to signed integers as values can underflow if + // the do not fit in the word size. + // Ploidy*shift_size bits for alleles and 1 bit for phase information (if required) + // Cost: 2^(8*word_width - (ploidy*(n_alleles + has_missing + hasEOV + 1) + has_mixed_phasing)) + int64_t limit = pow(2, 8*sizeof(YON_RLE_TYPE) - (base_ploidy*shift + add)) - 1; + assert(limit > 0); + + uint8_t gt_remap[256]; + memset(gt_remap, 256, 255); + for(U32 i = 0; i <= entry->n_allele; ++i){ + gt_remap[i << 1] = ((i+1) << 1); + gt_remap[(i << 1) + 1] = ((i+1) << 1) + 1; + } + gt_remap[0] = 0; + gt_remap[129] = 1; + + // Initial reference entry. + U32 rle_ppa_current_ref = YON_PACK_GT_DIPLOID_NALLELIC(gt_remap[gt[permutation_array[0] * sizeof(int8_t) * base_ploidy]] >> 1, + gt_remap[gt[permutation_array[0] * sizeof(int8_t) * base_ploidy + 1]] >> 1, + shift, add, + gt_remap[gt[permutation_array[0] * sizeof(int8_t) * base_ploidy + 1]]); + + // Iterate over all available samples. 
+ for(U32 i = 1; i < this->n_samples; ++i){ + const uint8_t* gt_ppa_target = >[permutation_array[i] * sizeof(int8_t) * base_ploidy]; + U32 rle_ppa_current = YON_PACK_GT_DIPLOID_NALLELIC(gt_remap[gt_ppa_target[0]] >> 1, + gt_remap[gt_ppa_target[1]] >> 1, + shift, add, + gt_remap[gt_ppa_target[1]]); + + assert(gt_remap[gt_ppa_target[0]] != 255); + assert(gt_remap[gt_ppa_target[1]] != 255); + + if(rle_ppa_current != rle_ppa_current_ref || l_runs == limit){ + YON_RLE_TYPE RLE = l_runs; + RLE <<= (base_ploidy*shift + add); + RLE |= rle_ppa_current_ref; + assert((RLE >> (base_ploidy*shift + add)) == l_runs); // Push RLE to buffer - runs.AddLiteral((YON_RLE_TYPE)RLE); + dst.AddLiteral((YON_RLE_TYPE)RLE); + ++dst.header.n_additions; + + rle_ppa_current_ref = rle_ppa_current; - // Reset and update - sumLength += length; - length = 0; - packed = packed_internal; + l_runs = 0; ++n_runs; } - ++length; - ++ppa_pos; + + ++l_runs; } - // Last entry - // Prepare RLE - RLE = length; - RLE <<= (ploidy*shift + add); - RLE |= packed; - assert((RLE >> (ploidy*shift + add)) == length); + ++n_runs; + + YON_RLE_TYPE RLE = l_runs; + RLE <<= (base_ploidy*shift + add); + RLE |= rle_ppa_current_ref; + assert((RLE >> (base_ploidy*shift + add)) == l_runs); // Push RLE to buffer - runs.AddLiteral((YON_RLE_TYPE)RLE); - ++n_runs; + dst.AddLiteral((YON_RLE_TYPE)RLE); + ++dst.header.n_additions; + ++dst.header.n_entries; - // Reset and update - sumLength += length; - assert(sumLength == this->n_samples); - assert(helper.n_runs == n_runs); - runs.header.n_additions += n_runs; - assert(ppa_pos == n_samples); - -#if ENCODER_GT_DEBUG == 1 - std::cout << 0 << '\t' << n_runs << '\t' << sizeof(YON_RLE_TYPE) << '\n'; -#endif - return(true); + return n_runs; + + //std::cerr << n_runs << "\t" << dst.buffer_data_uncompressed.size() << std::endl; } template -bool GenotypeEncoder::EncodeDiploidRLEnAllelic(const bcf_type& bcf_entry, - container_type& runs, - const U32* const ppa, - const rle_helper_type& helper) 
const +uint64_t GenotypeEncoder::EncodeMultiploid(const bcf1_t* entry, + const io::VcfGenotypeSummary& gt_summary, + const yon_gt_ppa& permutation_array, + container_type& dst) const { - const BYTE ploidy = 2; - U32 sumLength = 0; - YON_RLE_TYPE length = 1; - YON_RLE_TYPE RLE = 0; - const BYTE shift = ceil(log2(bcf_entry.body->n_allele + 2 + 1)); // Bits occupied per allele - const BYTE add = bcf_entry.gt_support.mixedPhasing ? 1 : 0; - const YON_RLE_TYPE run_limit = pow(2, 8*sizeof(YON_RLE_TYPE) - (ploidy*shift + add)) - 1; - - // Setup first run - const char* const data = &bcf_entry.data[bcf_entry.formatID[0].l_offset]; - BYTE allele1 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[0]]); - BYTE allele2 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[0] + sizeof(BYTE)]); - const bool phase = allele2 & 1; - - if((allele1 >> 1) == 0) allele1 = 0; - else if(allele1 == 0x81) allele1 = 1; - else allele1 = (allele1 >> 1) + 1; - - if((allele2 >> 1) == 0) allele2 = 0; - else if(allele2 == 0x81) allele2 = 1; - else allele2 = (allele2 >> 1) + 1; - - YON_RLE_TYPE packed = YON_PACK_GT_DIPLOID_NALLELIC(allele2, allele1, shift, add, phase); - - U32 ppa_pos = 1; - U64 n_runs = 0; - for(U32 i = ploidy; i < this->n_samples * ploidy; i += ploidy){ - BYTE allele1 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[ppa_pos]]); - BYTE allele2 = *reinterpret_cast(&data[ploidy*sizeof(BYTE)*ppa[ppa_pos] + sizeof(BYTE)]); - const bool phase = allele2 & 1; - - if((allele1 >> 1) == 0) allele1 = 0; - else if(allele1 == 0x81) allele1 = 1; - else allele1 = (allele1 >> 1) + 1; - - if((allele2 >> 1) == 0) allele2 = 0; - else if(allele2 == 0x81) allele2 = 1; - else allele2 = (allele2 >> 1) + 1; - - const YON_RLE_TYPE packed_internal = YON_PACK_GT_DIPLOID_NALLELIC(allele2, allele1, shift, add, phase); - - - if(packed != packed_internal || length == run_limit){ - // Prepare RLE - RLE = (length << (ploidy*shift + add)) | packed; - assert((RLE >> (ploidy*shift + add)) == length); - assert(length != 
0); + // This method is currently only valid if the genotypic + // data is stored as BCF_BT_INT8 in the htslib bcf1_t + // record. + assert(entry->d.fmt[0].type == BCF_BT_INT8); + const uint8_t base_ploidy = entry->d.fmt[0].n; + const uint8_t* gt = entry->d.fmt[0].p; + const uint32_t l_gt = entry->d.fmt[0].p_len; + const uint64_t limit = std::numeric_limits::max(); + assert(permutation_array.n_samples * base_ploidy == l_gt); + assert(base_ploidy * sizeof(uint8_t) * this->n_samples == l_gt); + + // Remap genotype encoding such that 0 maps to missing and + // 1 maps to the sentinel node symbol (EOV). + uint8_t gt_remap[256]; + memset(gt_remap, 256, 255); + for(U32 i = 0; i <= entry->n_allele; ++i){ + gt_remap[i << 1] = ((i+1) << 1); + gt_remap[(i << 1) + 1] = ((i+1) << 1) + 1; + } + gt_remap[0] = 0; + gt_remap[129] = 1; - // Push RLE to buffer - runs.AddLiteral((YON_RLE_TYPE)RLE); + // Start parameters for run-length encoding. + uint64_t n_runs = 0; // Number of runs. + YON_RLE_TYPE l_run = 1; // Current run length. + + // Keep track of the current reference sequence as we + // iterate over the available genotypes. + uint8_t* reference = new uint8_t[base_ploidy]; + for(U32 i = 0; i < base_ploidy; ++i) + reference[i] = gt_remap[gt[permutation_array[0] * sizeof(int8_t) * base_ploidy + i]]; + + // Hash of current reference genotype sequence. + uint64_t hash_value_ppa_ref = XXH64(>[permutation_array[0] * sizeof(int8_t) * base_ploidy], sizeof(int8_t) * base_ploidy, 89231478); + + // Iterate over all available samples. 
+ for(U32 i = 1; i < this->n_samples; ++i){ + const uint8_t* gt_ppa_target = >[permutation_array[i] * sizeof(int8_t) * base_ploidy]; + uint64_t hash_value_ppa = XXH64(gt_ppa_target, sizeof(int8_t) * base_ploidy, 89231478); + + if(hash_value_ppa != hash_value_ppa_ref){ + dst.AddLiteral(l_run); + for(U32 k = 0; k < base_ploidy; ++k) dst.AddLiteral(reference[k]); + ++dst.header.n_additions; - // Reset and update - sumLength += length; - length = 0; - packed = packed_internal; ++n_runs; + l_run = 0; + hash_value_ppa_ref = hash_value_ppa; + for(U32 k = 0; k < base_ploidy; ++k) reference[k] = gt_remap[gt_ppa_target[k]]; } - ++length; - ++ppa_pos; - } - // Last entry - // Prepare RLE - RLE = (length << (ploidy*shift + add)) | packed; - assert((RLE >> (ploidy*shift + add)) == length); - assert(length != 0); - // Push RLE to buffer - runs.AddLiteral((YON_RLE_TYPE)RLE); + // Overflow: trigger a break + if(l_run == limit){ + dst.AddLiteral(l_run); + for(U32 k = 0; k < base_ploidy; ++k) dst.AddLiteral(reference[k]); + ++dst.header.n_additions; + + ++n_runs; + l_run = 0; + } + ++l_run; + } + dst.AddLiteral(l_run); + for(U32 k = 0; k < base_ploidy; ++k) dst.AddLiteral(reference[k]); + ++dst.header.n_additions; + ++dst.header.n_entries; + + //std::cerr << l_run << ":" << (U32)reference[0]; + //for(U32 k = 1; k < base_ploidy; ++k) std::cerr << "," << (U32)reference[k]; + //std::cerr << std::endl; ++n_runs; - // Reset and update - sumLength += length; - assert(sumLength == this->n_samples); - assert(helper.n_runs == n_runs); - assert(ppa_pos == n_samples); - runs.header.n_additions += n_runs; + //std::cerr << n_runs << "\t" << dst.buffer_data_uncompressed.size() << std::endl; -#if ENCODER_GT_DEBUG == 1 - std::cout << 1 << '\t' << n_runs << '\t' << sizeof(YON_RLE_TYPE) << '\n'; -#endif + delete [] reference; - return(true); + return n_runs; } + /**< * Parallel support structure: this object encapsulates * a thread that runs the `EncodeParallel` function with @@ -440,42 +482,58 @@ 
bool GenotypeEncoder::EncodeDiploidRLEnAllelic(const bcf_type& bcf_entry, */ struct CalcSlave{ typedef CalcSlave self_type; - typedef bcf::BCFEntry bcf_type; - typedef bcf::BCFReader bcf_reader_type; typedef core::MetaEntry meta_type; typedef GenotypeEncoderSlaveHelper helper_type; - CalcSlave(){} + CalcSlave() : + thread_idx_(0), + n_threads_(0), + encoder_(nullptr), + //reader_(nullptr), + meta_entries_(nullptr), + ppa_(nullptr), + helpers_(nullptr) + {} + ~CalcSlave(){} - std::thread* Start(const GenotypeEncoder& encoder, const U32 thread_idx, const U32 n_threads, const bcf_reader_type& reader, meta_type* meta_entries, const U32* const ppa, helper_type* helpers){ - this->encoder = &encoder; - this->thread_idx = thread_idx; - this->n_threads = n_threads; - this->reader = &reader; - this->meta_entries = meta_entries; - this->ppa = ppa; - this->helpers = helpers; + std::thread* Start(const GenotypeEncoder& encoder, + const U32 thread_idx, + const U32 n_threads, + //const bcf_reader_type& reader, + meta_type* meta_entries, + const U32* const ppa, + helper_type* helpers) + { + this->encoder_ = &encoder; + this->thread_idx_ = thread_idx; + this->n_threads_ = n_threads; + //this->reader_ = &reader; + this->meta_entries_ = meta_entries; + this->ppa_ = ppa; + this->helpers_ = helpers; this->thread = std::thread(&self_type::Run_, this); return(&this->thread); } private: - const GenotypeEncoder* encoder; - U32 thread_idx; - U32 n_threads; - const bcf_reader_type* reader; - meta_type* meta_entries; - const U32* ppa; - helper_type* helpers; - void Run_(void){ - for(U32 i = this->thread_idx; i < this->reader->size(); i += this->n_threads){ - encoder->EncodeParallel((*reader)[i], meta_entries[i], ppa, helpers[i]); + exit(1); + for(U32 i = this->thread_idx_; i < 5; i += this->n_threads_){ + //this->encoder_->EncodeParallel((*this->reader_)[i], this->meta_entries_[i], this->ppa_, this->helpers_[i]); } } +private: + U32 thread_idx_; + U32 n_threads_; + const GenotypeEncoder* 
encoder_; + //const bcf_reader_type* reader_; + meta_type* meta_entries_; + const U32* ppa_; + helper_type* helpers_; + public: std::thread thread; }; diff --git a/lib/algorithm/compression/libzpaq.cpp b/lib/algorithm/compression/libzpaq.cpp deleted file mode 100644 index 651011c..0000000 --- a/lib/algorithm/compression/libzpaq.cpp +++ /dev/null @@ -1,7753 +0,0 @@ -/* - * libzpaq.cpp - * - * Created on: 6 Mar 2018 - * Author: mklarqvist - */ - - -/* libzpaq.cpp - LIBZPAQ Version 7.15 implementation - Aug. 17, 2016. - - libdivsufsort.c for divsufsort 2.00, included within, is - (C) 2003-2008 Yuta Mori, all rights reserved. - It is released under the MIT license as described in the comments - at the beginning of that section. - - Some of the code for AES is from libtomcrypt 1.17 by Tom St. Denis - and is public domain. - - The Salsa20/8 code for Scrypt is by D. Bernstein and is public domain. - - All of the remaining software is provided as-is, with no warranty. - I, Matt Mahoney, release this software into - the public domain. This applies worldwide. - In some countries this may not be legally possible; if so: - I grant anyone the right to use this software for any purpose, - without any conditions, unless such conditions are required by law. - -LIBZPAQ is a C++ library for compression and decompression of data -conforming to the ZPAQ level 2 standard. See http://mattmahoney.net/zpaq/ -See libzpaq.h for additional documentation. 
-*/ - -#include "libzpaq.h" - -#include -#include -#include -#include -#include - -#define unix 1 - -#ifdef unix -#ifndef NOJIT -#include -#endif -#else -#include -#include -#endif - -namespace libzpaq { - -// Read 16 bit little-endian number -int toU16(const char* p) { - return (p[0]&255)+256*(p[1]&255); -} - -// Default read() and write() -int Reader::read(char* buf, int n) { - int i=0, c; - while (i=0) - buf[i++]=c; - return i; -} - -void Writer::write(const char* buf, int n) { - for (int i=0; i 0 bytes of executable memory and update -// p to point to it and newsize = n. Free any previously -// allocated memory first. If newsize is 0 then free only. -// Call error in case of failure. If NOJIT, ignore newsize -// and set p=0, n=0 without allocating memory. -void allocx(U8* &p, int &n, int newsize) { -#ifdef NOJIT - p=0; - n=0; -#else - if (p || n) { - if (p) -#ifdef unix - munmap(p, n); -#else // Windows - VirtualFree(p, 0, MEM_RELEASE); -#endif - p=0; - n=0; - } - if (newsize>0) { -#ifdef unix - p=(U8*)mmap(0, newsize, PROT_READ|PROT_WRITE|PROT_EXEC, - MAP_PRIVATE|MAP_ANON, -1, 0); - if ((void*)p==MAP_FAILED) p=0; -#else - p=(U8*)VirtualAlloc(0, newsize, MEM_RESERVE|MEM_COMMIT, - PAGE_EXECUTE_READWRITE); -#endif - if (p) - n=newsize; - else { - n=0; - error("allocx failed"); - } - } -#endif -} - -//////////////////////////// SHA1 //////////////////////////// - -// SHA1 code, see http://en.wikipedia.org/wiki/SHA-1 - -// Start a new hash -void SHA1::init() { - len=0; - h[0]=0x67452301; - h[1]=0xEFCDAB89; - h[2]=0x98BADCFE; - h[3]=0x10325476; - h[4]=0xC3D2E1F0; - memset(w, 0, sizeof(w)); -} - -// Return old result and start a new hash -const char* SHA1::result() { - - // pad and append length - const U64 s=len; - put(0x80); - while ((len&511)!=448) - put(0); - put(s>>56); - put(s>>48); - put(s>>40); - put(s>>32); - put(s>>24); - put(s>>16); - put(s>>8); - put(s); - - // copy h to hbuf - for (int i=0; i<5; ++i) { - hbuf[4*i]=h[i]>>24; - hbuf[4*i+1]=h[i]>>16; - 
hbuf[4*i+2]=h[i]>>8; - hbuf[4*i+3]=h[i]; - } - - // return hash prior to clearing state - init(); - return hbuf; -} - -// Hash buf[0..n-1] -void SHA1::write(const char* buf, int64_t n) { - const unsigned char* p=(const unsigned char*) buf; - for (; n>0 && (U32(len)&511)!=0; --n) put(*p++); - for (; n>=64; n-=64) { - for (int i=0; i<16; ++i) - w[i]=p[0]<<24|p[1]<<16|p[2]<<8|p[3], p+=4; - len+=512; - process(); - } - for (; n>0; --n) put(*p++); -} - -// Hash 1 block of 64 bytes -void SHA1::process() { - U32 a=h[0], b=h[1], c=h[2], d=h[3], e=h[4]; - static const U32 k[4]={0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6}; - #define f(a,b,c,d,e,i) \ - if (i>=16) \ - w[(i)&15]^=w[(i-3)&15]^w[(i-8)&15]^w[(i-14)&15], \ - w[(i)&15]=w[(i)&15]<<1|w[(i)&15]>>31; \ - e+=(a<<5|a>>27)+k[(i)/20]+w[(i)&15] \ - +((i)%40>=20 ? b^c^d : i>=40 ? (b&c)|(d&(b|c)) : d^(b&(c^d))); \ - b=b<<30|b>>2; - #define r(i) f(a,b,c,d,e,i) f(e,a,b,c,d,i+1) f(d,e,a,b,c,i+2) \ - f(c,d,e,a,b,i+3) f(b,c,d,e,a,i+4) - r(0) r(5) r(10) r(15) r(20) r(25) r(30) r(35) - r(40) r(45) r(50) r(55) r(60) r(65) r(70) r(75) - #undef f - #undef r - h[0]+=a; h[1]+=b; h[2]+=c; h[3]+=d; h[4]+=e; -} - -//////////////////////////// SHA256 ////////////////////////// - -void SHA256::init() { - len0=len1=0; - s[0]=0x6a09e667; - s[1]=0xbb67ae85; - s[2]=0x3c6ef372; - s[3]=0xa54ff53a; - s[4]=0x510e527f; - s[5]=0x9b05688c; - s[6]=0x1f83d9ab; - s[7]=0x5be0cd19; - memset(w, 0, sizeof(w)); -} - -void SHA256::process() { - - #define ror(a,b) ((a)>>(b)|(a<<(32-(b)))) - - #define m(i) \ - w[(i)&15]+=w[(i-7)&15] \ - +(ror(w[(i-15)&15],7)^ror(w[(i-15)&15],18)^(w[(i-15)&15]>>3)) \ - +(ror(w[(i-2)&15],17)^ror(w[(i-2)&15],19)^(w[(i-2)&15]>>10)) - - #define r(a,b,c,d,e,f,g,h,i) { \ - unsigned t1=ror(e,14)^e; \ - t1=ror(t1,5)^e; \ - h+=ror(t1,6)+((e&f)^(~e&g))+k[i]+w[(i)&15]; } \ - d+=h; \ - {unsigned t1=ror(a,9)^a; \ - t1=ror(t1,11)^a; \ - h+=ror(t1,2)+((a&b)^(c&(a^b))); } - - #define mr(a,b,c,d,e,f,g,h,i) m(i); r(a,b,c,d,e,f,g,h,i); - - #define 
r8(i) \ - r(a,b,c,d,e,f,g,h,i); \ - r(h,a,b,c,d,e,f,g,i+1); \ - r(g,h,a,b,c,d,e,f,i+2); \ - r(f,g,h,a,b,c,d,e,i+3); \ - r(e,f,g,h,a,b,c,d,i+4); \ - r(d,e,f,g,h,a,b,c,i+5); \ - r(c,d,e,f,g,h,a,b,i+6); \ - r(b,c,d,e,f,g,h,a,i+7); - - #define mr8(i) \ - mr(a,b,c,d,e,f,g,h,i); \ - mr(h,a,b,c,d,e,f,g,i+1); \ - mr(g,h,a,b,c,d,e,f,i+2); \ - mr(f,g,h,a,b,c,d,e,i+3); \ - mr(e,f,g,h,a,b,c,d,i+4); \ - mr(d,e,f,g,h,a,b,c,i+5); \ - mr(c,d,e,f,g,h,a,b,i+6); \ - mr(b,c,d,e,f,g,h,a,i+7); - - static const unsigned k[64]={ - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2}; - - unsigned a=s[0]; - unsigned b=s[1]; - unsigned c=s[2]; - unsigned d=s[3]; - unsigned e=s[4]; - unsigned f=s[5]; - unsigned g=s[6]; - unsigned h=s[7]; - - r8(0); - r8(8); - mr8(16); - mr8(24); - mr8(32); - mr8(40); - mr8(48); - mr8(56); - - s[0]+=a; - s[1]+=b; - s[2]+=c; - s[3]+=d; - s[4]+=e; - s[5]+=f; - s[6]+=g; - s[7]+=h; - - #undef mr8 - #undef r8 - #undef mr - #undef r - #undef m - #undef ror -}; - -// Return old result and start a new hash -const char* SHA256::result() { - - // pad and append length - const unsigned s1=len1, s0=len0; - put(0x80); - while ((len0&511)!=448) put(0); - put(s1>>24); - put(s1>>16); - put(s1>>8); - put(s1); - put(s0>>24); - put(s0>>16); - 
put(s0>>8); - put(s0); - - // copy s to hbuf - for (int i=0; i<8; ++i) { - hbuf[4*i]=s[i]>>24; - hbuf[4*i+1]=s[i]>>16; - hbuf[4*i+2]=s[i]>>8; - hbuf[4*i+3]=s[i]; - } - - // return hash prior to clearing state - init(); - return hbuf; -} - -//////////////////////////// AES ///////////////////////////// - -// Some AES code is derived from libtomcrypt 1.17 (public domain). - -#define Te4_0 0x000000FF & Te4 -#define Te4_1 0x0000FF00 & Te4 -#define Te4_2 0x00FF0000 & Te4 -#define Te4_3 0xFF000000 & Te4 - -// Extract byte n of x -static inline unsigned byte(unsigned x, unsigned n) {return (x>>(8*n))&255;} - -// x = y[0..3] MSB first -static inline void LOAD32H(U32& x, const char* y) { - const unsigned char* u=(const unsigned char*)y; - x=u[0]<<24|u[1]<<16|u[2]<<8|u[3]; -} - -// y[0..3] = x MSB first -static inline void STORE32H(U32& x, unsigned char* y) { - y[0]=x>>24; - y[1]=x>>16; - y[2]=x>>8; - y[3]=x; -} - -#define setup_mix(temp) \ - ((Te4_3[byte(temp, 2)]) ^ (Te4_2[byte(temp, 1)]) ^ \ - (Te4_1[byte(temp, 0)]) ^ (Te4_0[byte(temp, 3)])) - -// Initialize encryption tables and round key. keylen is 16, 24, or 32. 
-AES_CTR::AES_CTR(const char* key, int keylen, const char* iv) { - assert(key != NULL); - assert(keylen==16 || keylen==24 || keylen==32); - - // Initialize IV (default 0) - iv0=iv1=0; - if (iv) { - LOAD32H(iv0, iv); - LOAD32H(iv1, iv+4); - } - - // Initialize encryption tables - for (int i=0; i<256; ++i) { - unsigned s1= - "\x63\x7c\x77\x7b\xf2\x6b\x6f\xc5\x30\x01\x67\x2b\xfe\xd7\xab\x76" - "\xca\x82\xc9\x7d\xfa\x59\x47\xf0\xad\xd4\xa2\xaf\x9c\xa4\x72\xc0" - "\xb7\xfd\x93\x26\x36\x3f\xf7\xcc\x34\xa5\xe5\xf1\x71\xd8\x31\x15" - "\x04\xc7\x23\xc3\x18\x96\x05\x9a\x07\x12\x80\xe2\xeb\x27\xb2\x75" - "\x09\x83\x2c\x1a\x1b\x6e\x5a\xa0\x52\x3b\xd6\xb3\x29\xe3\x2f\x84" - "\x53\xd1\x00\xed\x20\xfc\xb1\x5b\x6a\xcb\xbe\x39\x4a\x4c\x58\xcf" - "\xd0\xef\xaa\xfb\x43\x4d\x33\x85\x45\xf9\x02\x7f\x50\x3c\x9f\xa8" - "\x51\xa3\x40\x8f\x92\x9d\x38\xf5\xbc\xb6\xda\x21\x10\xff\xf3\xd2" - "\xcd\x0c\x13\xec\x5f\x97\x44\x17\xc4\xa7\x7e\x3d\x64\x5d\x19\x73" - "\x60\x81\x4f\xdc\x22\x2a\x90\x88\x46\xee\xb8\x14\xde\x5e\x0b\xdb" - "\xe0\x32\x3a\x0a\x49\x06\x24\x5c\xc2\xd3\xac\x62\x91\x95\xe4\x79" - "\xe7\xc8\x37\x6d\x8d\xd5\x4e\xa9\x6c\x56\xf4\xea\x65\x7a\xae\x08" - "\xba\x78\x25\x2e\x1c\xa6\xb4\xc6\xe8\xdd\x74\x1f\x4b\xbd\x8b\x8a" - "\x70\x3e\xb5\x66\x48\x03\xf6\x0e\x61\x35\x57\xb9\x86\xc1\x1d\x9e" - "\xe1\xf8\x98\x11\x69\xd9\x8e\x94\x9b\x1e\x87\xe9\xce\x55\x28\xdf" - "\x8c\xa1\x89\x0d\xbf\xe6\x42\x68\x41\x99\x2d\x0f\xb0\x54\xbb\x16" - [i]&255; - unsigned s2=s1<<1; - if (s2>=0x100) s2^=0x11b; - unsigned s3=s1^s2; - Te0[i]=s2<<24|s1<<16|s1<<8|s3; - Te1[i]=s3<<24|s2<<16|s1<<8|s1; - Te2[i]=s1<<24|s3<<16|s2<<8|s1; - Te3[i]=s1<<24|s1<<16|s3<<8|s2; - Te4[i]=s1<<24|s1<<16|s1<<8|s1; - } - - // setup the forward key - Nr = 10 + ((keylen/8)-2)*2; // 10, 12, or 14 rounds - int i = 0; - U32* rk = &ek[0]; - U32 temp; - static const U32 rcon[10] = { - 0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL, - 0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL, - 0x1B000000UL, 0x36000000UL}; // round 
constants - - LOAD32H(rk[0], key ); - LOAD32H(rk[1], key + 4); - LOAD32H(rk[2], key + 8); - LOAD32H(rk[3], key + 12); - if (keylen == 16) { - for (;;) { - temp = rk[3]; - rk[4] = rk[0] ^ setup_mix(temp) ^ rcon[i]; - rk[5] = rk[1] ^ rk[4]; - rk[6] = rk[2] ^ rk[5]; - rk[7] = rk[3] ^ rk[6]; - if (++i == 10) { - break; - } - rk += 4; - } - } - else if (keylen == 24) { - LOAD32H(rk[4], key + 16); - LOAD32H(rk[5], key + 20); - for (;;) { - temp = rk[5]; - rk[ 6] = rk[ 0] ^ setup_mix(temp) ^ rcon[i]; - rk[ 7] = rk[ 1] ^ rk[ 6]; - rk[ 8] = rk[ 2] ^ rk[ 7]; - rk[ 9] = rk[ 3] ^ rk[ 8]; - if (++i == 8) { - break; - } - rk[10] = rk[ 4] ^ rk[ 9]; - rk[11] = rk[ 5] ^ rk[10]; - rk += 6; - } - } - else if (keylen == 32) { - LOAD32H(rk[4], key + 16); - LOAD32H(rk[5], key + 20); - LOAD32H(rk[6], key + 24); - LOAD32H(rk[7], key + 28); - for (;;) { - temp = rk[7]; - rk[ 8] = rk[ 0] ^ setup_mix(temp) ^ rcon[i]; - rk[ 9] = rk[ 1] ^ rk[ 8]; - rk[10] = rk[ 2] ^ rk[ 9]; - rk[11] = rk[ 3] ^ rk[10]; - if (++i == 7) { - break; - } - temp = rk[11]; - rk[12] = rk[ 4] ^ setup_mix(temp<<24|temp>>8); - rk[13] = rk[ 5] ^ rk[12]; - rk[14] = rk[ 6] ^ rk[13]; - rk[15] = rk[ 7] ^ rk[14]; - rk += 8; - } - } -} - -// Encrypt to ct[16] -void AES_CTR::encrypt(U32 s0, U32 s1, U32 s2, U32 s3, unsigned char* ct) { - int r = Nr >> 1; - U32 *rk = &ek[0]; - U32 t0=0, t1=0, t2=0, t3=0; - s0 ^= rk[0]; - s1 ^= rk[1]; - s2 ^= rk[2]; - s3 ^= rk[3]; - for (;;) { - t0 = - Te0[byte(s0, 3)] ^ - Te1[byte(s1, 2)] ^ - Te2[byte(s2, 1)] ^ - Te3[byte(s3, 0)] ^ - rk[4]; - t1 = - Te0[byte(s1, 3)] ^ - Te1[byte(s2, 2)] ^ - Te2[byte(s3, 1)] ^ - Te3[byte(s0, 0)] ^ - rk[5]; - t2 = - Te0[byte(s2, 3)] ^ - Te1[byte(s3, 2)] ^ - Te2[byte(s0, 1)] ^ - Te3[byte(s1, 0)] ^ - rk[6]; - t3 = - Te0[byte(s3, 3)] ^ - Te1[byte(s0, 2)] ^ - Te2[byte(s1, 1)] ^ - Te3[byte(s2, 0)] ^ - rk[7]; - - rk += 8; - if (--r == 0) { - break; - } - - s0 = - Te0[byte(t0, 3)] ^ - Te1[byte(t1, 2)] ^ - Te2[byte(t2, 1)] ^ - Te3[byte(t3, 0)] ^ - rk[0]; - s1 = - 
Te0[byte(t1, 3)] ^ - Te1[byte(t2, 2)] ^ - Te2[byte(t3, 1)] ^ - Te3[byte(t0, 0)] ^ - rk[1]; - s2 = - Te0[byte(t2, 3)] ^ - Te1[byte(t3, 2)] ^ - Te2[byte(t0, 1)] ^ - Te3[byte(t1, 0)] ^ - rk[2]; - s3 = - Te0[byte(t3, 3)] ^ - Te1[byte(t0, 2)] ^ - Te2[byte(t1, 1)] ^ - Te3[byte(t2, 0)] ^ - rk[3]; - } - - // apply last round and map cipher state to byte array block: - s0 = - (Te4_3[byte(t0, 3)]) ^ - (Te4_2[byte(t1, 2)]) ^ - (Te4_1[byte(t2, 1)]) ^ - (Te4_0[byte(t3, 0)]) ^ - rk[0]; - STORE32H(s0, ct); - s1 = - (Te4_3[byte(t1, 3)]) ^ - (Te4_2[byte(t2, 2)]) ^ - (Te4_1[byte(t3, 1)]) ^ - (Te4_0[byte(t0, 0)]) ^ - rk[1]; - STORE32H(s1, ct+4); - s2 = - (Te4_3[byte(t2, 3)]) ^ - (Te4_2[byte(t3, 2)]) ^ - (Te4_1[byte(t0, 1)]) ^ - (Te4_0[byte(t1, 0)]) ^ - rk[2]; - STORE32H(s2, ct+8); - s3 = - (Te4_3[byte(t3, 3)]) ^ - (Te4_2[byte(t0, 2)]) ^ - (Te4_1[byte(t1, 1)]) ^ - (Te4_0[byte(t2, 0)]) ^ - rk[3]; - STORE32H(s3, ct+12); -} - -// Encrypt or decrypt slice buf[0..n-1] at offset by XOR with AES(i) where -// i is the 128 bit big-endian distance from the start in 16 byte blocks. 
-void AES_CTR::encrypt(char* buf, int n, U64 offset) { - for (U64 i=offset/16; i<=(offset+n)/16; ++i) { - unsigned char ct[16]; - encrypt(iv0, iv1, i>>32, i, ct); - for (int j=0; j<16; ++j) { - const int k=i*16-offset+j; - if (k>=0 && k=0; j-=8) sha256.put(i>>j); - memcpy(b, sha256.result(), 32); - for (int j=0; j>(32-b))) - x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); - x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); - x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); - x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); - x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); - x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); - x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); - x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); - x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); - x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); - x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); - x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); - x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); - x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); - x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); - x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); - #undef R - } - for (int i=0; i<16; ++i) b[i]+=x[i]; -} - -// BlockMix_{Salsa20/8, r} on b[0..128*r-1] -static void blockmix(U32* b, int r) { - assert(r<=8); - U32 x[16]; - U32 y[256]; - memcpy(x, b+32*r-16, 64); - for (int i=0; i<2*r; ++i) { - for (int j=0; j<16; ++j) x[j]^=b[i*16+j]; - salsa8(x); - memcpy(&y[i*16], x, 64); - } - for (int i=0; i x(32*r), v(32*r*n); - for (int i=0; i>(i%4*8); -} - -// Strengthen password pw[0..pwlen-1] and salt[0..saltlen-1] -// to produce key buf[0..buflen-1]. Uses O(n*r*p) time and 128*r*n bytes -// of memory. n must be a power of 2 and r <= 8. 
-void scrypt(const char* pw, int pwlen, - const char* salt, int saltlen, - int n, int r, int p, char* buf, int buflen) { - assert(r<=8); - assert(n>0 && (n&(n-1))==0); // power of 2? - libzpaq::Array b(p*r*128); - pbkdf2(pw, pwlen, salt, saltlen, 1, &b[0], p*r*128); - for (int i=0; i=1 && (buf[0]=='z' || buf[0]=='7')) - buf[0]^=0x80; -} - -//////////////////////////// Component /////////////////////// - -// A Component is a context model, indirect context model, match model, -// fixed weight mixer, adaptive 2 input mixer without or with current -// partial byte as context, adaptive m input mixer (without or with), -// or SSE (without or with). - -const int compsize[256]={0,2,3,2,3,4,6,6,3,5}; - -void Component::init() { - limit=cxt=a=b=c=0; - cm.resize(0); - ht.resize(0); - a16.resize(0); -} - -////////////////////////// StateTable //////////////////////// - -// sns[i*4] -> next state if 0, next state if 1, n0, n1 -static const U8 sns[1024]={ - 1, 2, 0, 0, 3, 5, 1, 0, - 4, 6, 0, 1, 7, 9, 2, 0, - 8, 11, 1, 1, 8, 11, 1, 1, - 10, 12, 0, 2, 13, 15, 3, 0, - 14, 17, 2, 1, 14, 17, 2, 1, - 16, 19, 1, 2, 16, 19, 1, 2, - 18, 20, 0, 3, 21, 23, 4, 0, - 22, 25, 3, 1, 22, 25, 3, 1, - 24, 27, 2, 2, 24, 27, 2, 2, - 26, 29, 1, 3, 26, 29, 1, 3, - 28, 30, 0, 4, 31, 33, 5, 0, - 32, 35, 4, 1, 32, 35, 4, 1, - 34, 37, 3, 2, 34, 37, 3, 2, - 36, 39, 2, 3, 36, 39, 2, 3, - 38, 41, 1, 4, 38, 41, 1, 4, - 40, 42, 0, 5, 43, 33, 6, 0, - 44, 47, 5, 1, 44, 47, 5, 1, - 46, 49, 4, 2, 46, 49, 4, 2, - 48, 51, 3, 3, 48, 51, 3, 3, - 50, 53, 2, 4, 50, 53, 2, 4, - 52, 55, 1, 5, 52, 55, 1, 5, - 40, 56, 0, 6, 57, 45, 7, 0, - 58, 47, 6, 1, 58, 47, 6, 1, - 60, 63, 5, 2, 60, 63, 5, 2, - 62, 65, 4, 3, 62, 65, 4, 3, - 64, 67, 3, 4, 64, 67, 3, 4, - 66, 69, 2, 5, 66, 69, 2, 5, - 52, 71, 1, 6, 52, 71, 1, 6, - 54, 72, 0, 7, 73, 59, 8, 0, - 74, 61, 7, 1, 74, 61, 7, 1, - 76, 63, 6, 2, 76, 63, 6, 2, - 78, 81, 5, 3, 78, 81, 5, 3, - 80, 83, 4, 4, 80, 83, 4, 4, - 82, 85, 3, 5, 82, 85, 3, 5, - 66, 87, 2, 6, 66, 87, 2, 6, - 
68, 89, 1, 7, 68, 89, 1, 7, - 70, 90, 0, 8, 91, 59, 9, 0, - 92, 77, 8, 1, 92, 77, 8, 1, - 94, 79, 7, 2, 94, 79, 7, 2, - 96, 81, 6, 3, 96, 81, 6, 3, - 98, 101, 5, 4, 98, 101, 5, 4, - 100, 103, 4, 5, 100, 103, 4, 5, - 82, 105, 3, 6, 82, 105, 3, 6, - 84, 107, 2, 7, 84, 107, 2, 7, - 86, 109, 1, 8, 86, 109, 1, 8, - 70, 110, 0, 9, 111, 59, 10, 0, - 112, 77, 9, 1, 112, 77, 9, 1, - 114, 97, 8, 2, 114, 97, 8, 2, - 116, 99, 7, 3, 116, 99, 7, 3, - 62, 101, 6, 4, 62, 101, 6, 4, - 80, 83, 5, 5, 80, 83, 5, 5, - 100, 67, 4, 6, 100, 67, 4, 6, - 102, 119, 3, 7, 102, 119, 3, 7, - 104, 121, 2, 8, 104, 121, 2, 8, - 86, 123, 1, 9, 86, 123, 1, 9, - 70, 124, 0, 10, 125, 59, 11, 0, - 126, 77, 10, 1, 126, 77, 10, 1, - 128, 97, 9, 2, 128, 97, 9, 2, - 60, 63, 8, 3, 60, 63, 8, 3, - 66, 69, 3, 8, 66, 69, 3, 8, - 104, 131, 2, 9, 104, 131, 2, 9, - 86, 133, 1, 10, 86, 133, 1, 10, - 70, 134, 0, 11, 135, 59, 12, 0, - 136, 77, 11, 1, 136, 77, 11, 1, - 138, 97, 10, 2, 138, 97, 10, 2, - 104, 141, 2, 10, 104, 141, 2, 10, - 86, 143, 1, 11, 86, 143, 1, 11, - 70, 144, 0, 12, 145, 59, 13, 0, - 146, 77, 12, 1, 146, 77, 12, 1, - 148, 97, 11, 2, 148, 97, 11, 2, - 104, 151, 2, 11, 104, 151, 2, 11, - 86, 153, 1, 12, 86, 153, 1, 12, - 70, 154, 0, 13, 155, 59, 14, 0, - 156, 77, 13, 1, 156, 77, 13, 1, - 158, 97, 12, 2, 158, 97, 12, 2, - 104, 161, 2, 12, 104, 161, 2, 12, - 86, 163, 1, 13, 86, 163, 1, 13, - 70, 164, 0, 14, 165, 59, 15, 0, - 166, 77, 14, 1, 166, 77, 14, 1, - 168, 97, 13, 2, 168, 97, 13, 2, - 104, 171, 2, 13, 104, 171, 2, 13, - 86, 173, 1, 14, 86, 173, 1, 14, - 70, 174, 0, 15, 175, 59, 16, 0, - 176, 77, 15, 1, 176, 77, 15, 1, - 178, 97, 14, 2, 178, 97, 14, 2, - 104, 181, 2, 14, 104, 181, 2, 14, - 86, 183, 1, 15, 86, 183, 1, 15, - 70, 184, 0, 16, 185, 59, 17, 0, - 186, 77, 16, 1, 186, 77, 16, 1, - 74, 97, 15, 2, 74, 97, 15, 2, - 104, 89, 2, 15, 104, 89, 2, 15, - 86, 187, 1, 16, 86, 187, 1, 16, - 70, 188, 0, 17, 189, 59, 18, 0, - 190, 77, 17, 1, 86, 191, 1, 17, - 70, 192, 0, 18, 193, 59, 19, 0, - 194, 
77, 18, 1, 86, 195, 1, 18, - 70, 196, 0, 19, 193, 59, 20, 0, - 197, 77, 19, 1, 86, 198, 1, 19, - 70, 196, 0, 20, 199, 77, 20, 1, - 86, 200, 1, 20, 201, 77, 21, 1, - 86, 202, 1, 21, 203, 77, 22, 1, - 86, 204, 1, 22, 205, 77, 23, 1, - 86, 206, 1, 23, 207, 77, 24, 1, - 86, 208, 1, 24, 209, 77, 25, 1, - 86, 210, 1, 25, 211, 77, 26, 1, - 86, 212, 1, 26, 213, 77, 27, 1, - 86, 214, 1, 27, 215, 77, 28, 1, - 86, 216, 1, 28, 217, 77, 29, 1, - 86, 218, 1, 29, 219, 77, 30, 1, - 86, 220, 1, 30, 221, 77, 31, 1, - 86, 222, 1, 31, 223, 77, 32, 1, - 86, 224, 1, 32, 225, 77, 33, 1, - 86, 226, 1, 33, 227, 77, 34, 1, - 86, 228, 1, 34, 229, 77, 35, 1, - 86, 230, 1, 35, 231, 77, 36, 1, - 86, 232, 1, 36, 233, 77, 37, 1, - 86, 234, 1, 37, 235, 77, 38, 1, - 86, 236, 1, 38, 237, 77, 39, 1, - 86, 238, 1, 39, 239, 77, 40, 1, - 86, 240, 1, 40, 241, 77, 41, 1, - 86, 242, 1, 41, 243, 77, 42, 1, - 86, 244, 1, 42, 245, 77, 43, 1, - 86, 246, 1, 43, 247, 77, 44, 1, - 86, 248, 1, 44, 249, 77, 45, 1, - 86, 250, 1, 45, 251, 77, 46, 1, - 86, 252, 1, 46, 253, 77, 47, 1, - 86, 254, 1, 47, 253, 77, 48, 1, - 86, 254, 1, 48, 0, 0, 0, 0 -}; - -// Initialize next state table ns[state*4] -> next if 0, next if 1, n0, n1 -StateTable::StateTable() { - memcpy(ns, sns, sizeof(ns)); -} - -/////////////////////////// ZPAQL ////////////////////////// - -// Write header to out2, return true if HCOMP/PCOMP section is present. -// If pp is true, then write only the postprocessor code. 
-bool ZPAQL::write(Writer* out2, bool pp) { - if (header.size()<=6) return false; - assert(header[0]+256*header[1]==cend-2+hend-hbegin); - assert(cend>=7); - assert(hbegin>=cend); - assert(hend>=hbegin); - assert(out2); - if (!pp) { // if not a postprocessor then write COMP - for (int i=0; iput(header[i]); - } - else { // write PCOMP size only - out2->put((hend-hbegin)&255); - out2->put((hend-hbegin)>>8); - } - for (int i=hbegin; iput(header[i]); - return true; -} - -// Read header from in2 -int ZPAQL::read(Reader* in2) { - - // Get header size and allocate - int hsize=in2->get(); - hsize+=in2->get()*256; - header.resize(hsize+300); - cend=hbegin=hend=0; - header[cend++]=hsize&255; - header[cend++]=hsize>>8; - while (cend<7) header[cend++]=in2->get(); // hh hm ph pm n - - // Read COMP - int n=header[cend-1]; - for (int i=0; iget(); // component type - if (type<0 || type>255) error("unexpected end of file"); - header[cend++]=type; // component type - int size=compsize[type]; - if (size<1) error("Invalid component type"); - if (cend+size>hsize) error("COMP overflows header"); - for (int j=1; jget(); - } - if ((header[cend++]=in2->get())!=0) error("missing COMP END"); - - // Insert a guard gap and read HCOMP - hbegin=hend=cend+128; - if (hend>hsize+129) error("missing HCOMP"); - while (hendget(); - if (op==-1) error("unexpected end of file"); - header[hend++]=op; - } - if ((header[hend++]=in2->get())!=0) error("missing HCOMP END"); - assert(cend>=7 && cendhbegin && hend6); - assert(output==0); - assert(sha1==0); - init(header[2], header[3]); // hh, hm -} - -// Initialize machine state as PCOMP -void ZPAQL::initp() { - assert(header.isize()>6); - init(header[4], header[5]); // ph, pm -} - -// Flush pending output -void ZPAQL::flush() { - if (output) output->write(&outbuf[0], bufptr); - if (sha1) sha1->write(&outbuf[0], bufptr); - bufptr=0; -} - -// pow(2, x) -static double pow2(int x) { - double r=1; - for (; x>0; x--) r+=r; - return r; -} - -// Return memory 
requirement in bytes -double ZPAQL::memory() { - double mem=pow2(header[2]+2)+pow2(header[3]) // hh hm - +pow2(header[4]+2)+pow2(header[5]) // ph pm - +header.size(); - int cp=7; // start of comp list - for (int i=0; i0); - assert(cend>=7); - assert(hbegin>=cend+128); - assert(hend>=hbegin); - assert(hend0); - if (hbits>32) error("H too big"); - if (mbits>32) error("M too big"); - h.resize(1, hbits); - m.resize(1, mbits); - r.resize(256); - a=b=c=d=pc=f=0; -} - -// Run program on input by interpreting header -void ZPAQL::run0(U32 input) { - assert(cend>6); - assert(hbegin>=cend+128); - assert(hend>=hbegin); - assert(hend0); - assert(h.size()>0); - assert(header[0]+256*header[1]==cend+hend-hbegin-2); - pc=hbegin; - a=input; - while (execute()) ; -} - -// Execute one instruction, return 0 after HALT else 1 -int ZPAQL::execute() { - switch(header[pc++]) { - case 0: err(); break; // ERROR - case 1: ++a; break; // A++ - case 2: --a; break; // A-- - case 3: a = ~a; break; // A! - case 4: a = 0; break; // A=0 - case 7: a = r[header[pc++]]; break; // A=R N - case 8: swap(b); break; // B<>A - case 9: ++b; break; // B++ - case 10: --b; break; // B-- - case 11: b = ~b; break; // B! - case 12: b = 0; break; // B=0 - case 15: b = r[header[pc++]]; break; // B=R N - case 16: swap(c); break; // C<>A - case 17: ++c; break; // C++ - case 18: --c; break; // C-- - case 19: c = ~c; break; // C! - case 20: c = 0; break; // C=0 - case 23: c = r[header[pc++]]; break; // C=R N - case 24: swap(d); break; // D<>A - case 25: ++d; break; // D++ - case 26: --d; break; // D-- - case 27: d = ~d; break; // D! - case 28: d = 0; break; // D=0 - case 31: d = r[header[pc++]]; break; // D=R N - case 32: swap(m(b)); break; // *B<>A - case 33: ++m(b); break; // *B++ - case 34: --m(b); break; // *B-- - case 35: m(b) = ~m(b); break; // *B! 
- case 36: m(b) = 0; break; // *B=0 - case 39: if (f) pc+=((header[pc]+128)&255)-127; else ++pc; break; // JT N - case 40: swap(m(c)); break; // *C<>A - case 41: ++m(c); break; // *C++ - case 42: --m(c); break; // *C-- - case 43: m(c) = ~m(c); break; // *C! - case 44: m(c) = 0; break; // *C=0 - case 47: if (!f) pc+=((header[pc]+128)&255)-127; else ++pc; break; // JF N - case 48: swap(h(d)); break; // *D<>A - case 49: ++h(d); break; // *D++ - case 50: --h(d); break; // *D-- - case 51: h(d) = ~h(d); break; // *D! - case 52: h(d) = 0; break; // *D=0 - case 55: r[header[pc++]] = a; break; // R=A N - case 56: return 0 ; // HALT - case 57: outc(a&255); break; // OUT - case 59: a = (a+m(b)+512)*773; break; // HASH - case 60: h(d) = (h(d)+a+512)*773; break; // HASHD - case 63: pc+=((header[pc]+128)&255)-127; break; // JMP N - case 64: break; // A=A - case 65: a = b; break; // A=B - case 66: a = c; break; // A=C - case 67: a = d; break; // A=D - case 68: a = m(b); break; // A=*B - case 69: a = m(c); break; // A=*C - case 70: a = h(d); break; // A=*D - case 71: a = header[pc++]; break; // A= N - case 72: b = a; break; // B=A - case 73: break; // B=B - case 74: b = c; break; // B=C - case 75: b = d; break; // B=D - case 76: b = m(b); break; // B=*B - case 77: b = m(c); break; // B=*C - case 78: b = h(d); break; // B=*D - case 79: b = header[pc++]; break; // B= N - case 80: c = a; break; // C=A - case 81: c = b; break; // C=B - case 82: break; // C=C - case 83: c = d; break; // C=D - case 84: c = m(b); break; // C=*B - case 85: c = m(c); break; // C=*C - case 86: c = h(d); break; // C=*D - case 87: c = header[pc++]; break; // C= N - case 88: d = a; break; // D=A - case 89: d = b; break; // D=B - case 90: d = c; break; // D=C - case 91: break; // D=D - case 92: d = m(b); break; // D=*B - case 93: d = m(c); break; // D=*C - case 94: d = h(d); break; // D=*D - case 95: d = header[pc++]; break; // D= N - case 96: m(b) = a; break; // *B=A - case 97: m(b) = b; break; // *B=B - case 
98: m(b) = c; break; // *B=C - case 99: m(b) = d; break; // *B=D - case 100: break; // *B=*B - case 101: m(b) = m(c); break; // *B=*C - case 102: m(b) = h(d); break; // *B=*D - case 103: m(b) = header[pc++]; break; // *B= N - case 104: m(c) = a; break; // *C=A - case 105: m(c) = b; break; // *C=B - case 106: m(c) = c; break; // *C=C - case 107: m(c) = d; break; // *C=D - case 108: m(c) = m(b); break; // *C=*B - case 109: break; // *C=*C - case 110: m(c) = h(d); break; // *C=*D - case 111: m(c) = header[pc++]; break; // *C= N - case 112: h(d) = a; break; // *D=A - case 113: h(d) = b; break; // *D=B - case 114: h(d) = c; break; // *D=C - case 115: h(d) = d; break; // *D=D - case 116: h(d) = m(b); break; // *D=*B - case 117: h(d) = m(c); break; // *D=*C - case 118: break; // *D=*D - case 119: h(d) = header[pc++]; break; // *D= N - case 128: a += a; break; // A+=A - case 129: a += b; break; // A+=B - case 130: a += c; break; // A+=C - case 131: a += d; break; // A+=D - case 132: a += m(b); break; // A+=*B - case 133: a += m(c); break; // A+=*C - case 134: a += h(d); break; // A+=*D - case 135: a += header[pc++]; break; // A+= N - case 136: a -= a; break; // A-=A - case 137: a -= b; break; // A-=B - case 138: a -= c; break; // A-=C - case 139: a -= d; break; // A-=D - case 140: a -= m(b); break; // A-=*B - case 141: a -= m(c); break; // A-=*C - case 142: a -= h(d); break; // A-=*D - case 143: a -= header[pc++]; break; // A-= N - case 144: a *= a; break; // A*=A - case 145: a *= b; break; // A*=B - case 146: a *= c; break; // A*=C - case 147: a *= d; break; // A*=D - case 148: a *= m(b); break; // A*=*B - case 149: a *= m(c); break; // A*=*C - case 150: a *= h(d); break; // A*=*D - case 151: a *= header[pc++]; break; // A*= N - case 152: div(a); break; // A/=A - case 153: div(b); break; // A/=B - case 154: div(c); break; // A/=C - case 155: div(d); break; // A/=D - case 156: div(m(b)); break; // A/=*B - case 157: div(m(c)); break; // A/=*C - case 158: div(h(d)); break; 
// A/=*D - case 159: div(header[pc++]); break; // A/= N - case 160: mod(a); break; // A%=A - case 161: mod(b); break; // A%=B - case 162: mod(c); break; // A%=C - case 163: mod(d); break; // A%=D - case 164: mod(m(b)); break; // A%=*B - case 165: mod(m(c)); break; // A%=*C - case 166: mod(h(d)); break; // A%=*D - case 167: mod(header[pc++]); break; // A%= N - case 168: a &= a; break; // A&=A - case 169: a &= b; break; // A&=B - case 170: a &= c; break; // A&=C - case 171: a &= d; break; // A&=D - case 172: a &= m(b); break; // A&=*B - case 173: a &= m(c); break; // A&=*C - case 174: a &= h(d); break; // A&=*D - case 175: a &= header[pc++]; break; // A&= N - case 176: a &= ~ a; break; // A&~A - case 177: a &= ~ b; break; // A&~B - case 178: a &= ~ c; break; // A&~C - case 179: a &= ~ d; break; // A&~D - case 180: a &= ~ m(b); break; // A&~*B - case 181: a &= ~ m(c); break; // A&~*C - case 182: a &= ~ h(d); break; // A&~*D - case 183: a &= ~ header[pc++]; break; // A&~ N - case 184: a |= a; break; // A|=A - case 185: a |= b; break; // A|=B - case 186: a |= c; break; // A|=C - case 187: a |= d; break; // A|=D - case 188: a |= m(b); break; // A|=*B - case 189: a |= m(c); break; // A|=*C - case 190: a |= h(d); break; // A|=*D - case 191: a |= header[pc++]; break; // A|= N - case 192: a ^= a; break; // A^=A - case 193: a ^= b; break; // A^=B - case 194: a ^= c; break; // A^=C - case 195: a ^= d; break; // A^=D - case 196: a ^= m(b); break; // A^=*B - case 197: a ^= m(c); break; // A^=*C - case 198: a ^= h(d); break; // A^=*D - case 199: a ^= header[pc++]; break; // A^= N - case 200: a <<= (a&31); break; // A<<=A - case 201: a <<= (b&31); break; // A<<=B - case 202: a <<= (c&31); break; // A<<=C - case 203: a <<= (d&31); break; // A<<=D - case 204: a <<= (m(b)&31); break; // A<<=*B - case 205: a <<= (m(c)&31); break; // A<<=*C - case 206: a <<= (h(d)&31); break; // A<<=*D - case 207: a <<= (header[pc++]&31); break; // A<<= N - case 208: a >>= (a&31); break; // A>>=A - 
case 209: a >>= (b&31); break; // A>>=B - case 210: a >>= (c&31); break; // A>>=C - case 211: a >>= (d&31); break; // A>>=D - case 212: a >>= (m(b)&31); break; // A>>=*B - case 213: a >>= (m(c)&31); break; // A>>=*C - case 214: a >>= (h(d)&31); break; // A>>=*D - case 215: a >>= (header[pc++]&31); break; // A>>= N - case 216: f = 1; break; // A==A - case 217: f = (a == b); break; // A==B - case 218: f = (a == c); break; // A==C - case 219: f = (a == d); break; // A==D - case 220: f = (a == U32(m(b))); break; // A==*B - case 221: f = (a == U32(m(c))); break; // A==*C - case 222: f = (a == h(d)); break; // A==*D - case 223: f = (a == U32(header[pc++])); break; // A== N - case 224: f = 0; break; // AA - case 233: f = (a > b); break; // A>B - case 234: f = (a > c); break; // A>C - case 235: f = (a > d); break; // A>D - case 236: f = (a > U32(m(b))); break; // A>*B - case 237: f = (a > U32(m(c))); break; // A>*C - case 238: f = (a > h(d)); break; // A>*D - case 239: f = (a > U32(header[pc++])); break; // A> N - case 255: if((pc=hbegin+header[pc]+256*header[pc+1])>=hend)err();break;//LJ - default: err(); - } - return 1; -} - -// Print illegal instruction error message and exit -void ZPAQL::err() { - error("ZPAQL execution error"); -} - -///////////////////////// Predictor ///////////////////////// - -// sdt2k[i]=2048/i; -static const int sdt2k[256]={ - 0, 2048, 1024, 682, 512, 409, 341, 292, - 256, 227, 204, 186, 170, 157, 146, 136, - 128, 120, 113, 107, 102, 97, 93, 89, - 85, 81, 78, 75, 73, 70, 68, 66, - 64, 62, 60, 58, 56, 55, 53, 52, - 51, 49, 48, 47, 46, 45, 44, 43, - 42, 41, 40, 40, 39, 38, 37, 37, - 36, 35, 35, 34, 34, 33, 33, 32, - 32, 31, 31, 30, 30, 29, 29, 28, - 28, 28, 27, 27, 26, 26, 26, 25, - 25, 25, 24, 24, 24, 24, 23, 23, - 23, 23, 22, 22, 22, 22, 21, 21, - 21, 21, 20, 20, 20, 20, 20, 19, - 19, 19, 19, 19, 18, 18, 18, 18, - 18, 18, 17, 17, 17, 17, 17, 17, - 17, 16, 16, 16, 16, 16, 16, 16, - 16, 15, 15, 15, 15, 15, 15, 15, - 15, 14, 14, 14, 14, 14, 14, 14, 
- 14, 14, 14, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13, 12, 12, - 12, 12, 12, 12, 12, 12, 12, 12, - 12, 12, 12, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8 -}; - -// sdt[i]=(1<<17)/(i*2+3)*2; -static const int sdt[1024]={ - 87380, 52428, 37448, 29126, 23830, 20164, 17476, 15420, - 13796, 12482, 11396, 10484, 9708, 9038, 8456, 7942, - 7488, 7084, 6720, 6392, 6096, 5824, 5576, 5348, - 5140, 4946, 4766, 4598, 4442, 4296, 4160, 4032, - 3912, 3798, 3692, 3590, 3494, 3404, 3318, 3236, - 3158, 3084, 3012, 2944, 2880, 2818, 2758, 2702, - 2646, 2594, 2544, 2496, 2448, 2404, 2360, 2318, - 2278, 2240, 2202, 2166, 2130, 2096, 2064, 2032, - 2000, 1970, 1940, 1912, 1884, 1858, 1832, 1806, - 1782, 1758, 1736, 1712, 1690, 1668, 1648, 1628, - 1608, 1588, 1568, 1550, 1532, 1514, 1496, 1480, - 1464, 1448, 1432, 1416, 1400, 1386, 1372, 1358, - 1344, 1330, 1316, 1304, 1290, 1278, 1266, 1254, - 1242, 1230, 1218, 1208, 1196, 1186, 1174, 1164, - 1154, 1144, 1134, 1124, 1114, 1106, 1096, 1086, - 1078, 1068, 1060, 1052, 1044, 1036, 1028, 1020, - 1012, 1004, 996, 988, 980, 974, 966, 960, - 952, 946, 938, 932, 926, 918, 912, 906, - 900, 894, 888, 882, 876, 870, 864, 858, - 852, 848, 842, 836, 832, 826, 820, 816, - 810, 806, 800, 796, 790, 786, 782, 776, - 772, 768, 764, 758, 754, 750, 746, 742, - 738, 734, 730, 726, 722, 718, 714, 710, - 706, 702, 698, 694, 690, 688, 684, 680, - 676, 672, 670, 666, 662, 660, 656, 652, - 650, 646, 644, 640, 636, 634, 630, 628, - 624, 622, 618, 616, 612, 610, 608, 604, - 602, 598, 596, 594, 590, 588, 586, 582, - 580, 578, 576, 572, 570, 568, 566, 562, - 560, 558, 556, 554, 550, 548, 546, 544, - 542, 540, 538, 536, 532, 530, 528, 526, - 524, 522, 520, 518, 516, 514, 512, 510, - 508, 506, 504, 502, 
500, 498, 496, 494, - 492, 490, 488, 488, 486, 484, 482, 480, - 478, 476, 474, 474, 472, 470, 468, 466, - 464, 462, 462, 460, 458, 456, 454, 454, - 452, 450, 448, 448, 446, 444, 442, 442, - 440, 438, 436, 436, 434, 432, 430, 430, - 428, 426, 426, 424, 422, 422, 420, 418, - 418, 416, 414, 414, 412, 410, 410, 408, - 406, 406, 404, 402, 402, 400, 400, 398, - 396, 396, 394, 394, 392, 390, 390, 388, - 388, 386, 386, 384, 382, 382, 380, 380, - 378, 378, 376, 376, 374, 372, 372, 370, - 370, 368, 368, 366, 366, 364, 364, 362, - 362, 360, 360, 358, 358, 356, 356, 354, - 354, 352, 352, 350, 350, 348, 348, 348, - 346, 346, 344, 344, 342, 342, 340, 340, - 340, 338, 338, 336, 336, 334, 334, 332, - 332, 332, 330, 330, 328, 328, 328, 326, - 326, 324, 324, 324, 322, 322, 320, 320, - 320, 318, 318, 316, 316, 316, 314, 314, - 312, 312, 312, 310, 310, 310, 308, 308, - 308, 306, 306, 304, 304, 304, 302, 302, - 302, 300, 300, 300, 298, 298, 298, 296, - 296, 296, 294, 294, 294, 292, 292, 292, - 290, 290, 290, 288, 288, 288, 286, 286, - 286, 284, 284, 284, 284, 282, 282, 282, - 280, 280, 280, 278, 278, 278, 276, 276, - 276, 276, 274, 274, 274, 272, 272, 272, - 272, 270, 270, 270, 268, 268, 268, 268, - 266, 266, 266, 266, 264, 264, 264, 262, - 262, 262, 262, 260, 260, 260, 260, 258, - 258, 258, 258, 256, 256, 256, 256, 254, - 254, 254, 254, 252, 252, 252, 252, 250, - 250, 250, 250, 248, 248, 248, 248, 248, - 246, 246, 246, 246, 244, 244, 244, 244, - 242, 242, 242, 242, 242, 240, 240, 240, - 240, 238, 238, 238, 238, 238, 236, 236, - 236, 236, 234, 234, 234, 234, 234, 232, - 232, 232, 232, 232, 230, 230, 230, 230, - 230, 228, 228, 228, 228, 228, 226, 226, - 226, 226, 226, 224, 224, 224, 224, 224, - 222, 222, 222, 222, 222, 220, 220, 220, - 220, 220, 220, 218, 218, 218, 218, 218, - 216, 216, 216, 216, 216, 216, 214, 214, - 214, 214, 214, 212, 212, 212, 212, 212, - 212, 210, 210, 210, 210, 210, 210, 208, - 208, 208, 208, 208, 208, 206, 206, 206, - 206, 206, 206, 204, 204, 204, 204, 204, - 
204, 204, 202, 202, 202, 202, 202, 202, - 200, 200, 200, 200, 200, 200, 198, 198, - 198, 198, 198, 198, 198, 196, 196, 196, - 196, 196, 196, 196, 194, 194, 194, 194, - 194, 194, 194, 192, 192, 192, 192, 192, - 192, 192, 190, 190, 190, 190, 190, 190, - 190, 188, 188, 188, 188, 188, 188, 188, - 186, 186, 186, 186, 186, 186, 186, 186, - 184, 184, 184, 184, 184, 184, 184, 182, - 182, 182, 182, 182, 182, 182, 182, 180, - 180, 180, 180, 180, 180, 180, 180, 178, - 178, 178, 178, 178, 178, 178, 178, 176, - 176, 176, 176, 176, 176, 176, 176, 176, - 174, 174, 174, 174, 174, 174, 174, 174, - 172, 172, 172, 172, 172, 172, 172, 172, - 172, 170, 170, 170, 170, 170, 170, 170, - 170, 170, 168, 168, 168, 168, 168, 168, - 168, 168, 168, 166, 166, 166, 166, 166, - 166, 166, 166, 166, 166, 164, 164, 164, - 164, 164, 164, 164, 164, 164, 162, 162, - 162, 162, 162, 162, 162, 162, 162, 162, - 160, 160, 160, 160, 160, 160, 160, 160, - 160, 160, 158, 158, 158, 158, 158, 158, - 158, 158, 158, 158, 158, 156, 156, 156, - 156, 156, 156, 156, 156, 156, 156, 154, - 154, 154, 154, 154, 154, 154, 154, 154, - 154, 154, 152, 152, 152, 152, 152, 152, - 152, 152, 152, 152, 152, 150, 150, 150, - 150, 150, 150, 150, 150, 150, 150, 150, - 150, 148, 148, 148, 148, 148, 148, 148, - 148, 148, 148, 148, 148, 146, 146, 146, - 146, 146, 146, 146, 146, 146, 146, 146, - 146, 144, 144, 144, 144, 144, 144, 144, - 144, 144, 144, 144, 144, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 140, 140, 140, 140, 140, 140, - 140, 140, 140, 140, 140, 140, 140, 138, - 138, 138, 138, 138, 138, 138, 138, 138, - 138, 138, 138, 138, 138, 136, 136, 136, - 136, 136, 136, 136, 136, 136, 136, 136, - 136, 136, 136, 134, 134, 134, 134, 134, - 134, 134, 134, 134, 134, 134, 134, 134, - 134, 132, 132, 132, 132, 132, 132, 132, - 132, 132, 132, 132, 132, 132, 132, 132, - 130, 130, 130, 130, 130, 130, 130, 130, - 130, 130, 130, 130, 130, 130, 130, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 
128, 128, 126 -}; - -// ssquasht[i]=int(32768.0/(1+exp((i-2048)*(-1.0/64)))); -// Middle 1344 of 4096 entries only. -static const U16 ssquasht[1344]={ - 0, 0, 0, 0, 0, 0, 0, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, - 7, 7, 7, 7, 8, 8, 8, 8, - 8, 8, 8, 8, 9, 9, 9, 9, - 9, 9, 10, 10, 10, 10, 10, 10, - 10, 11, 11, 11, 11, 11, 12, 12, - 12, 12, 12, 13, 13, 13, 13, 13, - 14, 14, 14, 14, 15, 15, 15, 15, - 15, 16, 16, 16, 17, 17, 17, 17, - 18, 18, 18, 18, 19, 19, 19, 20, - 20, 20, 21, 21, 21, 22, 22, 22, - 23, 23, 23, 24, 24, 25, 25, 25, - 26, 26, 27, 27, 28, 28, 28, 29, - 29, 30, 30, 31, 31, 32, 32, 33, - 33, 34, 34, 35, 36, 36, 37, 37, - 38, 38, 39, 40, 40, 41, 42, 42, - 43, 44, 44, 45, 46, 46, 47, 48, - 49, 49, 50, 51, 52, 53, 54, 54, - 55, 56, 57, 58, 59, 60, 61, 62, - 63, 64, 65, 66, 67, 68, 69, 70, - 71, 72, 73, 74, 76, 77, 78, 79, - 81, 82, 83, 84, 86, 87, 88, 90, - 91, 93, 94, 96, 97, 99, 100, 102, - 103, 105, 107, 108, 110, 112, 114, 115, - 117, 119, 121, 123, 125, 127, 129, 131, - 133, 135, 137, 139, 141, 144, 146, 148, - 151, 153, 155, 158, 160, 163, 165, 168, - 171, 173, 176, 179, 182, 184, 187, 190, - 193, 196, 199, 202, 206, 209, 212, 215, - 219, 222, 226, 229, 233, 237, 240, 244, - 248, 252, 256, 260, 264, 268, 272, 276, - 281, 285, 289, 294, 299, 303, 308, 313, - 318, 323, 328, 333, 338, 343, 349, 354, - 360, 365, 371, 377, 382, 388, 394, 401, - 407, 413, 420, 426, 433, 440, 446, 453, - 460, 467, 475, 482, 490, 497, 505, 513, - 521, 529, 537, 545, 554, 562, 571, 580, - 589, 598, 607, 617, 626, 636, 646, 656, - 666, 676, 686, 697, 708, 719, 730, 741, - 752, 764, 776, 788, 800, 812, 825, 
837, - 850, 863, 876, 890, 903, 917, 931, 946, - 960, 975, 990, 1005, 1020, 1036, 1051, 1067, - 1084, 1100, 1117, 1134, 1151, 1169, 1186, 1204, - 1223, 1241, 1260, 1279, 1298, 1318, 1338, 1358, - 1379, 1399, 1421, 1442, 1464, 1486, 1508, 1531, - 1554, 1577, 1600, 1624, 1649, 1673, 1698, 1724, - 1749, 1775, 1802, 1829, 1856, 1883, 1911, 1940, - 1968, 1998, 2027, 2057, 2087, 2118, 2149, 2181, - 2213, 2245, 2278, 2312, 2345, 2380, 2414, 2450, - 2485, 2521, 2558, 2595, 2633, 2671, 2709, 2748, - 2788, 2828, 2869, 2910, 2952, 2994, 3037, 3080, - 3124, 3168, 3213, 3259, 3305, 3352, 3399, 3447, - 3496, 3545, 3594, 3645, 3696, 3747, 3799, 3852, - 3906, 3960, 4014, 4070, 4126, 4182, 4240, 4298, - 4356, 4416, 4476, 4537, 4598, 4660, 4723, 4786, - 4851, 4916, 4981, 5048, 5115, 5183, 5251, 5320, - 5390, 5461, 5533, 5605, 5678, 5752, 5826, 5901, - 5977, 6054, 6131, 6210, 6289, 6369, 6449, 6530, - 6613, 6695, 6779, 6863, 6949, 7035, 7121, 7209, - 7297, 7386, 7476, 7566, 7658, 7750, 7842, 7936, - 8030, 8126, 8221, 8318, 8415, 8513, 8612, 8712, - 8812, 8913, 9015, 9117, 9221, 9324, 9429, 9534, - 9640, 9747, 9854, 9962, 10071, 10180, 10290, 10401, - 10512, 10624, 10737, 10850, 10963, 11078, 11192, 11308, - 11424, 11540, 11658, 11775, 11893, 12012, 12131, 12251, - 12371, 12491, 12612, 12734, 12856, 12978, 13101, 13224, - 13347, 13471, 13595, 13719, 13844, 13969, 14095, 14220, - 14346, 14472, 14599, 14725, 14852, 14979, 15106, 15233, - 15361, 15488, 15616, 15744, 15872, 16000, 16128, 16256, - 16384, 16511, 16639, 16767, 16895, 17023, 17151, 17279, - 17406, 17534, 17661, 17788, 17915, 18042, 18168, 18295, - 18421, 18547, 18672, 18798, 18923, 19048, 19172, 19296, - 19420, 19543, 19666, 19789, 19911, 20033, 20155, 20276, - 20396, 20516, 20636, 20755, 20874, 20992, 21109, 21227, - 21343, 21459, 21575, 21689, 21804, 21917, 22030, 22143, - 22255, 22366, 22477, 22587, 22696, 22805, 22913, 23020, - 23127, 23233, 23338, 23443, 23546, 23650, 23752, 23854, - 23955, 24055, 24155, 24254, 24352, 
24449, 24546, 24641, - 24737, 24831, 24925, 25017, 25109, 25201, 25291, 25381, - 25470, 25558, 25646, 25732, 25818, 25904, 25988, 26072, - 26154, 26237, 26318, 26398, 26478, 26557, 26636, 26713, - 26790, 26866, 26941, 27015, 27089, 27162, 27234, 27306, - 27377, 27447, 27516, 27584, 27652, 27719, 27786, 27851, - 27916, 27981, 28044, 28107, 28169, 28230, 28291, 28351, - 28411, 28469, 28527, 28585, 28641, 28697, 28753, 28807, - 28861, 28915, 28968, 29020, 29071, 29122, 29173, 29222, - 29271, 29320, 29368, 29415, 29462, 29508, 29554, 29599, - 29643, 29687, 29730, 29773, 29815, 29857, 29898, 29939, - 29979, 30019, 30058, 30096, 30134, 30172, 30209, 30246, - 30282, 30317, 30353, 30387, 30422, 30455, 30489, 30522, - 30554, 30586, 30618, 30649, 30680, 30710, 30740, 30769, - 30799, 30827, 30856, 30884, 30911, 30938, 30965, 30992, - 31018, 31043, 31069, 31094, 31118, 31143, 31167, 31190, - 31213, 31236, 31259, 31281, 31303, 31325, 31346, 31368, - 31388, 31409, 31429, 31449, 31469, 31488, 31507, 31526, - 31544, 31563, 31581, 31598, 31616, 31633, 31650, 31667, - 31683, 31700, 31716, 31731, 31747, 31762, 31777, 31792, - 31807, 31821, 31836, 31850, 31864, 31877, 31891, 31904, - 31917, 31930, 31942, 31955, 31967, 31979, 31991, 32003, - 32015, 32026, 32037, 32048, 32059, 32070, 32081, 32091, - 32101, 32111, 32121, 32131, 32141, 32150, 32160, 32169, - 32178, 32187, 32196, 32205, 32213, 32222, 32230, 32238, - 32246, 32254, 32262, 32270, 32277, 32285, 32292, 32300, - 32307, 32314, 32321, 32327, 32334, 32341, 32347, 32354, - 32360, 32366, 32373, 32379, 32385, 32390, 32396, 32402, - 32407, 32413, 32418, 32424, 32429, 32434, 32439, 32444, - 32449, 32454, 32459, 32464, 32468, 32473, 32478, 32482, - 32486, 32491, 32495, 32499, 32503, 32507, 32511, 32515, - 32519, 32523, 32527, 32530, 32534, 32538, 32541, 32545, - 32548, 32552, 32555, 32558, 32561, 32565, 32568, 32571, - 32574, 32577, 32580, 32583, 32585, 32588, 32591, 32594, - 32596, 32599, 32602, 32604, 32607, 32609, 32612, 32614, - 
32616, 32619, 32621, 32623, 32626, 32628, 32630, 32632, - 32634, 32636, 32638, 32640, 32642, 32644, 32646, 32648, - 32650, 32652, 32653, 32655, 32657, 32659, 32660, 32662, - 32664, 32665, 32667, 32668, 32670, 32671, 32673, 32674, - 32676, 32677, 32679, 32680, 32681, 32683, 32684, 32685, - 32686, 32688, 32689, 32690, 32691, 32693, 32694, 32695, - 32696, 32697, 32698, 32699, 32700, 32701, 32702, 32703, - 32704, 32705, 32706, 32707, 32708, 32709, 32710, 32711, - 32712, 32713, 32713, 32714, 32715, 32716, 32717, 32718, - 32718, 32719, 32720, 32721, 32721, 32722, 32723, 32723, - 32724, 32725, 32725, 32726, 32727, 32727, 32728, 32729, - 32729, 32730, 32730, 32731, 32731, 32732, 32733, 32733, - 32734, 32734, 32735, 32735, 32736, 32736, 32737, 32737, - 32738, 32738, 32739, 32739, 32739, 32740, 32740, 32741, - 32741, 32742, 32742, 32742, 32743, 32743, 32744, 32744, - 32744, 32745, 32745, 32745, 32746, 32746, 32746, 32747, - 32747, 32747, 32748, 32748, 32748, 32749, 32749, 32749, - 32749, 32750, 32750, 32750, 32750, 32751, 32751, 32751, - 32752, 32752, 32752, 32752, 32752, 32753, 32753, 32753, - 32753, 32754, 32754, 32754, 32754, 32754, 32755, 32755, - 32755, 32755, 32755, 32756, 32756, 32756, 32756, 32756, - 32757, 32757, 32757, 32757, 32757, 32757, 32757, 32758, - 32758, 32758, 32758, 32758, 32758, 32759, 32759, 32759, - 32759, 32759, 32759, 32759, 32759, 32760, 32760, 32760, - 32760, 32760, 32760, 32760, 32760, 32761, 32761, 32761, - 32761, 32761, 32761, 32761, 32761, 32761, 32761, 32762, - 32762, 32762, 32762, 32762, 32762, 32762, 32762, 32762, - 32762, 32762, 32762, 32763, 32763, 32763, 32763, 32763, - 32763, 32763, 32763, 32763, 32763, 32763, 32763, 32763, - 32763, 32764, 32764, 32764, 32764, 32764, 32764, 32764, - 32764, 32764, 32764, 32764, 32764, 32764, 32764, 32764, - 32764, 32764, 32764, 32764, 32765, 32765, 32765, 32765, - 32765, 32765, 32765, 32765, 32765, 32765, 32765, 32765, - 32765, 32765, 32765, 32765, 32765, 32765, 32765, 32765, - 32765, 32765, 32765, 32765, 
32765, 32765, 32766, 32766, - 32766, 32766, 32766, 32766, 32766, 32766, 32766, 32766, - 32766, 32766, 32766, 32766, 32766, 32766, 32766, 32766, - 32766, 32766, 32766, 32766, 32766, 32766, 32766, 32766, - 32766, 32766, 32766, 32766, 32766, 32766, 32766, 32766, - 32766, 32766, 32766, 32766, 32766, 32766, 32766, 32766, - 32766, 32766, 32767, 32767, 32767, 32767, 32767, 32767 -}; - -// stdt[i]=count of -i or i in botton or top of stretcht[] -static const U8 stdt[712]={ - 64, 128, 128, 128, 128, 128, 127, 128, - 127, 128, 127, 127, 127, 127, 126, 126, - 126, 126, 126, 125, 125, 124, 125, 124, - 123, 123, 123, 123, 122, 122, 121, 121, - 120, 120, 119, 119, 118, 118, 118, 116, - 117, 115, 116, 114, 114, 113, 113, 112, - 112, 111, 110, 110, 109, 108, 108, 107, - 106, 106, 105, 104, 104, 102, 103, 101, - 101, 100, 99, 98, 98, 97, 96, 96, - 94, 94, 94, 92, 92, 91, 90, 89, - 89, 88, 87, 86, 86, 84, 84, 84, - 82, 82, 81, 80, 79, 79, 78, 77, - 76, 76, 75, 74, 73, 73, 72, 71, - 70, 70, 69, 68, 67, 67, 66, 65, - 65, 64, 63, 62, 62, 61, 61, 59, - 59, 59, 57, 58, 56, 56, 55, 54, - 54, 53, 52, 52, 51, 51, 50, 49, - 49, 48, 48, 47, 47, 45, 46, 44, - 45, 43, 43, 43, 42, 41, 41, 40, - 40, 40, 39, 38, 38, 37, 37, 36, - 36, 36, 35, 34, 34, 34, 33, 32, - 33, 32, 31, 31, 30, 31, 29, 30, - 28, 29, 28, 28, 27, 27, 27, 26, - 26, 25, 26, 24, 25, 24, 24, 23, - 23, 23, 23, 22, 22, 21, 22, 21, - 20, 21, 20, 19, 20, 19, 19, 19, - 18, 18, 18, 18, 17, 17, 17, 17, - 16, 16, 16, 16, 15, 15, 15, 15, - 15, 14, 14, 14, 14, 13, 14, 13, - 13, 13, 12, 13, 12, 12, 12, 11, - 12, 11, 11, 11, 11, 11, 10, 11, - 10, 10, 10, 10, 9, 10, 9, 9, - 9, 9, 9, 8, 9, 8, 9, 8, - 8, 8, 7, 8, 8, 7, 7, 8, - 7, 7, 7, 6, 7, 7, 6, 6, - 7, 6, 6, 6, 6, 6, 6, 5, - 6, 5, 6, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 4, 5, 4, 5, - 4, 4, 5, 4, 4, 4, 4, 4, - 4, 3, 4, 4, 3, 4, 4, 3, - 3, 4, 3, 3, 3, 4, 3, 3, - 3, 3, 3, 3, 2, 3, 3, 3, - 2, 3, 2, 3, 3, 2, 2, 3, - 2, 2, 3, 2, 2, 2, 2, 3, - 2, 2, 2, 2, 2, 2, 1, 2, - 2, 2, 2, 1, 2, 2, 2, 1, - 2, 1, 2, 2, 
1, 2, 1, 2, - 1, 1, 2, 1, 1, 2, 1, 1, - 2, 1, 1, 1, 1, 2, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 0, 1, 1, 1, 1, 0, - 1, 1, 1, 0, 1, 1, 1, 0, - 1, 1, 0, 1, 1, 0, 1, 0, - 1, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 0, - 1, 0, 1, 0, 0, 1, 0, 1, - 0, 0, 1, 0, 0, 1, 0, 0, - 1, 0, 0, 1, 0, 0, 0, 1, - 0, 0, 1, 0, 0, 0, 1, 0, - 0, 0, 1, 0, 0, 0, 1, 0, - 0, 0, 0, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 1, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 1, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 1, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0 -}; - -Predictor::Predictor(ZPAQL& zr): - c8(1), hmap4(1), z(zr) { - assert(sizeof(U8)==1); - assert(sizeof(U16)==2); - assert(sizeof(U32)==4); - assert(sizeof(U64)==8); - assert(sizeof(short)==2); - assert(sizeof(int)==4); - pcode=0; - pcode_size=0; - initTables=false; -} - -Predictor::~Predictor() { - allocx(pcode, pcode_size, 0); // free executable memory -} - -// Initialize the predictor with a new model in z -void Predictor::init() { - - // Clear old JIT code if any - allocx(pcode, pcode_size, 0); - - // Initialize context hash function - z.inith(); - - // Initialize model independent tables - if (!initTables && isModeled()) { - initTables=true; - memcpy(dt2k, sdt2k, sizeof(dt2k)); - memcpy(dt, sdt, sizeof(dt)); - - // ssquasht[i]=int(32768.0/(1+exp((i-2048)*(-1.0/64)))); - // Copy middle 1344 of 4096 entries. 
- memset(squasht, 0, 1376*2); - memcpy(squasht+1376, ssquasht, 1344*2); - for (int i=2720; i<4096; ++i) squasht[i]=32767; - - // sstretcht[i]=int(log((i+0.5)/(32767.5-i))*64+0.5+100000)-100000; - int k=16384; - for (int i=0; i<712; ++i) - for (int j=stdt[i]; j>0; --j) - stretcht[k++]=i; - assert(k==32768); - for (int i=0; i<16384; ++i) - stretcht[i]=-stretcht[32767-i]; - -#ifndef NDEBUG - // Verify floating point math for squash() and stretch() - U32 sqsum=0, stsum=0; - for (int i=32767; i>=0; --i) - stsum=stsum*3+stretch(i); - for (int i=4095; i>=0; --i) - sqsum=sqsum*3+squash(i-2048); - assert(stsum==3887533746u); - assert(sqsum==2278286169u); -#endif - } - - // Initialize predictions - for (int i=0; i<256; ++i) h[i]=p[i]=0; - - // Initialize components - for (int i=0; i<256; ++i) // clear old model - comp[i].init(); - int n=z.header[6]; // hsize[0..1] hh hm ph pm n (comp)[n] END 0[128] (hcomp) END - const U8* cp=&z.header[7]; // start of component list - for (int i=0; i&z.header[0] && cp<&z.header[z.header.isize()-8]); - Component& cr=comp[i]; - switch(cp[0]) { - case CONS: // c - p[i]=(cp[1]-128)*4; - break; - case CM: // sizebits limit - if (cp[1]>32) error("max size for CM is 32"); - cr.cm.resize(1, cp[1]); // packed CM (22 bits) + CMCOUNT (10 bits) - cr.limit=cp[2]*4; - for (size_t j=0; j26) error("max size for ICM is 26"); - cr.limit=1023; - cr.cm.resize(256); - cr.ht.resize(64, cp[1]); - for (size_t j=0; j32 || cp[2]>32) error("max size for MATCH is 32 32"); - cr.cm.resize(1, cp[1]); // index - cr.ht.resize(1, cp[2]); // buf - cr.ht(0)=1; - break; - case AVG: // j k wt - if (cp[1]>=i) error("AVG j >= i"); - if (cp[2]>=i) error("AVG k >= i"); - break; - case MIX2: // sizebits j k rate mask - if (cp[1]>32) error("max size for MIX2 is 32"); - if (cp[3]>=i) error("MIX2 k >= i"); - if (cp[2]>=i) error("MIX2 j >= i"); - cr.c=(size_t(1)<32) error("max size for MIX is 32"); - if (cp[2]>=i) error("MIX j >= i"); - if (cp[3]<1 || cp[3]>i-cp[2]) error("MIX m not in 
1..i-j"); - int m=cp[3]; // number of inputs - assert(m>=1); - cr.c=(size_t(1)<32) error("max size for ISSE is 32"); - if (cp[2]>=i) error("ISSE j >= i"); - cr.ht.resize(64, cp[1]); - cr.cm.resize(512); - for (int j=0; j<256; ++j) { - cr.cm[j*2]=1<<15; - cr.cm[j*2+1]=clamp512k(stretch(st.cminit(j)>>8)*1024); - } - break; - case SSE: // sizebits j start limit - if (cp[1]>32) error("max size for SSE is 32"); - if (cp[2]>=i) error("SSE j >= i"); - if (cp[3]>cp[4]*4) error("SSE start > limit*4"); - cr.cm.resize(32, cp[1]); - cr.limit=cp[4]*4; - for (size_t j=0; j0); - cp+=compsize[*cp]; - assert(cp>=&z.header[7] && cp<&z.header[z.cend]); - } -} - -// Return next bit prediction using interpreted COMP code -int Predictor::predict0() { - assert(initTables); - assert(c8>=1 && c8<=255); - - // Predict next bit - int n=z.header[6]; - assert(n>0 && n<=255); - const U8* cp=&z.header[7]; - assert(cp[-1]==n); - for (int i=0; i&z.header[0] && cp<&z.header[z.header.isize()-8]); - Component& cr=comp[i]; - switch(cp[0]) { - case CONS: // c - break; - case CM: // sizebits limit - cr.cxt=h[i]^hmap4; - p[i]=stretch(cr.cm(cr.cxt)>>17); - break; - case ICM: // sizebits - assert((hmap4&15)>0); - if (c8==1 || (c8&0xf0)==16) cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); - cr.cxt=cr.ht[cr.c+(hmap4&15)]; - p[i]=stretch(cr.cm(cr.cxt)>>8); - break; - case MATCH: // sizebits bufbits: a=len, b=offset, c=bit, cxt=bitpos, - // ht=buf, limit=pos - assert(cr.cm.size()==(size_t(1)<>(7-cr.cxt))&1; // predicted bit - p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767); - } - break; - case AVG: // j k wt - p[i]=(p[cp[1]]*cp[3]+p[cp[2]]*(256-cp[3]))>>8; - break; - case MIX2: { // sizebits j k rate mask - // c=size cm=wt[size] cxt=input - cr.cxt=((h[i]+(c8&cp[5]))&(cr.c-1)); - assert(cr.cxt=0 && w<65536); - p[i]=(w*p[cp[2]]+(65536-w)*p[cp[3]])>>16; - assert(p[i]>=-2048 && p[i]<2048); - } - break; - case MIX: { // sizebits j m rate mask - // c=size cm=wt[size][m] cxt=index of wt in cm - int m=cp[3]; - assert(m>=1 && m<=i); 
- cr.cxt=h[i]+(c8&cp[5]); - cr.cxt=(cr.cxt&(cr.c-1))*m; // pointer to row of weights - assert(cr.cxt<=cr.cm.size()-m); - int* wt=(int*)&cr.cm[cr.cxt]; - p[i]=0; - for (int j=0; j>8)*p[cp[2]+j]; - p[i]=clamp2k(p[i]>>8); - } - break; - case ISSE: { // sizebits j -- c=hi, cxt=bh - assert((hmap4&15)>0); - if (c8==1 || (c8&0xf0)==16) - cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); - cr.cxt=cr.ht[cr.c+(hmap4&15)]; // bit history - int *wt=(int*)&cr.cm[cr.cxt*2]; - p[i]=clamp2k((wt[0]*p[cp[2]]+wt[1]*64)>>16); - } - break; - case SSE: { // sizebits j start limit - cr.cxt=(h[i]+c8)*32; - int pq=p[cp[2]]+992; - if (pq<0) pq=0; - if (pq>1983) pq=1983; - int wt=pq&63; - pq>>=6; - assert(pq>=0 && pq<=30); - cr.cxt+=pq; - p[i]=stretch(((cr.cm(cr.cxt)>>10)*(64-wt)+(cr.cm(cr.cxt+1)>>10)*wt)>>13); - cr.cxt+=wt>>5; - } - break; - default: - error("component predict not implemented"); - } - cp+=compsize[cp[0]]; - assert(cp<&z.header[z.cend]); - assert(p[i]>=-2048 && p[i]<2048); - } - assert(cp[0]==NONE); - return squash(p[n-1]); -} - -// Update model with decoded bit y (0...1) -void Predictor::update0(int y) { - assert(initTables); - assert(y==0 || y==1); - assert(c8>=1 && c8<=255); - assert(hmap4>=1 && hmap4<=511); - - // Update components - const U8* cp=&z.header[7]; - int n=z.header[6]; - assert(n>=1 && n<=255); - assert(cp[-1]==n); - for (int i=0; i>8))>>2; - } - break; - case MATCH: // sizebits bufbits: - // a=len, b=offset, c=bit, cm=index, cxt=bitpos - // ht=buf, limit=pos - { - assert(cr.a<=255); - assert(cr.c==0 || cr.c==1); - assert(cr.cxt<8); - assert(cr.cm.size()==(size_t(1)<>5; - int w=cr.a16[cr.cxt]; - w+=(err*(p[cp[2]]-p[cp[3]])+(1<<12))>>13; - if (w<0) w=0; - if (w>65535) w=65535; - cr.a16[cr.cxt]=w; - } - break; - case MIX: { // sizebits j m rate mask - // cm=wt[size][m], cxt=input - int m=cp[3]; - assert(m>0 && m<=i); - assert(cr.cm.size()==m*cr.c); - assert(cr.cxt+m<=cr.cm.size()); - int err=(y*32767-squash(p[i]))*cp[4]>>4; - int* wt=(int*)&cr.cm[cr.cxt]; - for (int j=0; 
j>13)); - } - break; - case ISSE: { // sizebits j -- c=hi, cxt=bh - assert(cr.cxt==cr.ht[cr.c+(hmap4&15)]); - int err=y*32767-squash(p[i]); - int *wt=(int*)&cr.cm[cr.cxt*2]; - wt[0]=clamp512k(wt[0]+((err*p[cp[2]]+(1<<12))>>13)); - wt[1]=clamp512k(wt[1]+((err+16)>>5)); - cr.ht[cr.c+(hmap4&15)]=st.next(cr.cxt, y); - } - break; - case SSE: // sizebits j start limit - train(cr, y); - break; - default: - assert(0); - } - cp+=compsize[cp[0]]; - assert(cp>=&z.header[7] && cp<&z.header[z.cend] - && cp<&z.header[z.header.isize()-8]); - } - assert(cp[0]==NONE); - - // Save bit y in c8, hmap4 - c8+=c8+y; - if (c8>=256) { - z.run(c8-256); - hmap4=1; - c8=1; - for (int i=0; i=16 && c8<32) - hmap4=(hmap4&0xf)<<5|y<<4|1; - else - hmap4=(hmap4&0x1f0)|(((hmap4&0xf)*2+y)&0xf); -} - -// Find cxt row in hash table ht. ht has rows of 16 indexed by the -// low sizebits of cxt with element 0 having the next higher 8 bits for -// collision detection. If not found after 3 adjacent tries, replace the -// row with lowest element 1 as priority. Return index of row. 
-size_t Predictor::find(Array& ht, int sizebits, U32 cxt) { - assert(initTables); - assert(ht.size()==size_t(16)<>sizebits&255; - size_t h0=(cxt*16)&(ht.size()-16); - if (ht[h0]==chk) return h0; - size_t h1=h0^16; - if (ht[h1]==chk) return h1; - size_t h2=h0^32; - if (ht[h2]==chk) return h2; - if (ht[h0+1]<=ht[h1+1] && ht[h0+1]<=ht[h2+1]) - return memset(&ht[h0], 0, 16), ht[h0]=chk, h0; - else if (ht[h1+1]=0 && p<65536); - assert(high>low && low>0); - if (currhigh) error("archive corrupted"); - assert(curr>=low && curr<=high); - U32 mid=low+U32(((high-low)*U64(U32(p)))>>16); // split range - assert(high>mid && mid>=low); - int y; - if (curr<=mid) y=1, high=mid; // pick half - else y=0, low=mid+1; - while ((high^low)<0x1000000) { // shift out identical leading bytes - high=high<<8|255; - low=low<<8; - low+=(low==0); - int c=get(); - if (c<0) error("unexpected end of file"); - curr=curr<<8|c; - } - return y; -} - -// Decompress 1 byte or -1 at end of input -int Decoder::decompress() { - if (pr.isModeled()) { // n>0 components? - if (curr==0) { // segment initialization - for (int i=0; i<4; ++i) - curr=curr<<8|get(); - } - if (decode(0)) { - if (curr!=0) error("decoding end of stream"); - return -1; - } - else { - int c=1; - while (c<256) { // get 8 bits - int p=pr.predict()*2+1; - c+=c+decode(p); - pr.update(c&1); - } - return c-256; - } - } - else { - if (curr==0) { - for (int i=0; i<4; ++i) curr=curr<<8|get(); - if (curr==0) return -1; - } - --curr; - return get(); - } -} - -// Find end of compressed data and return next byte -int Decoder::skip() { - int c=-1; - if (pr.isModeled()) { - while (curr==0) // at start? - curr=get(); - while (curr && (c=get())>=0) // find 4 zeros - curr=curr<<8|c; - while ((c=get())==0) ; // might be more than 4 - return c; - } - else { - if (curr==0) // at start? 
- for (int i=0; i<4 && (c=get())>=0; ++i) curr=curr<<8|c; - while (curr>0) { - while (curr>0) { - --curr; - if (get()<0) return error("skipped to EOF"), -1; - } - for (int i=0; i<4 && (c=get())>=0; ++i) curr=curr<<8|c; - } - if (c>=0) c=get(); - return c; - } -} - -////////////////////// PostProcessor ////////////////////// - -// Copy ph, pm from block header -void PostProcessor::init(int h, int m) { - state=hsize=0; - ph=h; - pm=m; - z.clear(); -} - -// (PASS=0 | PROG=1 psize[0..1] pcomp[0..psize-1]) data... EOB=-1 -// Return state: 1=PASS, 2..4=loading PROG, 5=PROG loaded -int PostProcessor::write(int c) { - assert(c>=-1 && c<=255); - switch (state) { - case 0: // initial state - if (c<0) error("Unexpected EOS"); - state=c+1; // 1=PASS, 2=PROG - if (state>2) error("unknown post processing type"); - if (state==1) z.clear(); - break; - case 1: // PASS - z.outc(c); - break; - case 2: // PROG - if (c<0) error("Unexpected EOS"); - hsize=c; // low byte of size - state=3; - break; - case 3: // PROG psize[0] - if (c<0) error("Unexpected EOS"); - hsize+=c*256; // high byte of psize - if (hsize<1) error("Empty PCOMP"); - z.header.resize(hsize+300); - z.cend=8; - z.hbegin=z.hend=z.cend+128; - z.header[4]=ph; - z.header[5]=pm; - state=4; - break; - case 4: // PROG psize[0..1] pcomp[0...] - if (c<0) error("Unexpected EOS"); - assert(z.hend>8; - z.initp(); - state=5; - } - break; - case 5: // PROG ... data - z.run(c); - if (c<0) z.flush(); - break; - } - return state; -} - -/////////////////////// Decompresser ///////////////////// - -// Find the start of a block and return true if found. Set memptr -// to memory used. 
-bool Decompresser::findBlock(double* memptr) { - assert(state==BLOCK); - - // Find start of block - U32 h1=0x3D49B113, h2=0x29EB7F93, h3=0x2614BE13, h4=0x3828EB13; - // Rolling hashes initialized to hash of first 13 bytes - int c; - while ((c=dec.get())!=-1) { - h1=h1*12+c; - h2=h2*20+c; - h3=h3*28+c; - h4=h4*44+c; - if (h1==0xB16B88F1 && h2==0xFF5376F1 && h3==0x72AC5BF1 && h4==0x2F909AF1) - break; // hash of 16 byte string - } - if (c==-1) return false; - - // Read header - if ((c=dec.get())!=1 && c!=2) error("unsupported ZPAQ level"); - if (dec.get()!=1) error("unsupported ZPAQL type"); - z.read(&dec); - if (c==1 && z.header.isize()>6 && z.header[6]==0) - error("ZPAQ level 1 requires at least 1 component"); - if (memptr) *memptr=z.memory(); - state=FILENAME; - decode_state=FIRSTSEG; - return true; -} - -// Read the start of a segment (1) or end of block code (255). -// If a segment is found, write the filename and return true, else false. -bool Decompresser::findFilename(Writer* filename) { - assert(state==FILENAME); - int c=dec.get(); - if (c==1) { // segment found - while (true) { - c=dec.get(); - if (c==-1) error("unexpected EOF"); - if (c==0) { - state=COMMENT; - return true; - } - if (filename) filename->put(c); - } - } - else if (c==255) { // end of block found - state=BLOCK; - return false; - } - else - error("missing segment or end of block"); - return false; -} - -// Read the comment from the segment header -void Decompresser::readComment(Writer* comment) { - assert(state==COMMENT); - state=DATA; - while (true) { - int c=dec.get(); - if (c==-1) error("unexpected EOF"); - if (c==0) break; - if (comment) comment->put(c); - } - if (dec.get()!=0) error("missing reserved byte"); -} - -// Decompress n bytes, or all if n < 0. 
Return false if done -bool Decompresser::decompress(int n) { - assert(state==DATA); - if (decode_state==SKIP) error("decompression after skipped segment"); - assert(decode_state!=SKIP); - - // Initialize models to start decompressing block - if (decode_state==FIRSTSEG) { - dec.init(); - assert(z.header.size()>5); - pp.init(z.header[4], z.header[5]); - decode_state=SEG; - } - - // Decompress and load PCOMP into postprocessor - while ((pp.getState()&3)!=1) - pp.write(dec.decompress()); - - // Decompress n bytes, or all if n < 0 - while (n) { - int c=dec.decompress(); - pp.write(c); - if (c==-1) { - state=SEGEND; - return false; - } - if (n>0) --n; - } - return true; -} - -// Read end of block. If a SHA1 checksum is present, write 1 and the -// 20 byte checksum into sha1string, else write 0 in first byte. -// If sha1string is 0 then discard it. -void Decompresser::readSegmentEnd(char* sha1string) { - assert(state==DATA || state==SEGEND); - - // Skip remaining data if any and get next byte - int c=0; - if (state==DATA) { - c=dec.skip(); - decode_state=SKIP; - } - else if (state==SEGEND) - c=dec.get(); - state=FILENAME; - - // Read checksum - if (c==254) { - if (sha1string) sha1string[0]=0; // no checksum - } - else if (c==253) { - if (sha1string) sha1string[0]=1; - for (int i=1; i<=20; ++i) { - c=dec.get(); - if (sha1string) sha1string[i]=c; - } - } - else - error("missing end of segment marker"); -} - -/////////////////////////// decompress() ////////////////////// - -void decompress(Reader* in, Writer* out) { - Decompresser d; - d.setInput(in); - d.setOutput(out); - while (d.findBlock()) { // don't calculate memory - while (d.findFilename()) { // discard filename - d.readComment(); // discard comment - d.decompress(); // to end of segment - d.readSegmentEnd(); // discard sha1string - } - } -} - -/////////////////////////// Encoder /////////////////////////// - -// Initialize for start of block -void Encoder::init() { - low=1; - high=0xFFFFFFFF; - pr.init(); - if 
(!pr.isModeled()) low=0, buf.resize(1<<16); -} - -// compress bit y having probability p/64K -void Encoder::encode(int y, int p) { - assert(out); - assert(p>=0 && p<65536); - assert(y==0 || y==1); - assert(high>low && low>0); - U32 mid=low+U32(((high-low)*U64(U32(p)))>>16); // split range - assert(high>mid && mid>=low); - if (y) high=mid; else low=mid+1; // pick half - while ((high^low)<0x1000000) { // write identical leading bytes - out->put(high>>24); // same as low>>24 - high=high<<8|255; - low=low<<8; - low+=(low==0); // so we don't code 4 0 bytes in a row - } -} - -// compress byte c (0..255 or -1=EOS) -void Encoder::compress(int c) { - assert(out); - if (pr.isModeled()) { - if (c==-1) - encode(1, 0); - else { - assert(c>=0 && c<=255); - encode(0, 0); - for (int i=7; i>=0; --i) { - int p=pr.predict()*2+1; - assert(p>0 && p<65536); - int y=c>>i&1; - encode(y, p); - pr.update(y); - } - } - } - else { - if (low && (c<0 || low==buf.size())) { - out->put((low>>24)&255); - out->put((low>>16)&255); - out->put((low>>8)&255); - out->put(low&255); - out->write(&buf[0], low); - low=0; - } - if (c>=0) buf[low++]=c; - } -} - -//////////////////////////// Compiler ///////////////////////// - -// Component names -const char* compname[256]= - {"","const","cm","icm","match","avg","mix2","mix","isse","sse",0}; - -// Opcodes -const char* opcodelist[272]={ -"error","a++", "a--", "a!", "a=0", "", "", "a=r", -"b<>a", "b++", "b--", "b!", "b=0", "", "", "b=r", -"c<>a", "c++", "c--", "c!", "c=0", "", "", "c=r", -"d<>a", "d++", "d--", "d!", "d=0", "", "", "d=r", -"*b<>a","*b++", "*b--", "*b!", "*b=0", "", "", "jt", -"*c<>a","*c++", "*c--", "*c!", "*c=0", "", "", "jf", -"*d<>a","*d++", "*d--", "*d!", "*d=0", "", "", "r=a", -"halt", "out", "", "hash", "hashd","", "", "jmp", -"a=a", "a=b", "a=c", "a=d", "a=*b", "a=*c", "a=*d", "a=", -"b=a", "b=b", "b=c", "b=d", "b=*b", "b=*c", "b=*d", "b=", -"c=a", "c=b", "c=c", "c=d", "c=*b", "c=*c", "c=*d", "c=", -"d=a", "d=b", "d=c", "d=d", "d=*b", 
"d=*c", "d=*d", "d=", -"*b=a", "*b=b", "*b=c", "*b=d", "*b=*b","*b=*c","*b=*d","*b=", -"*c=a", "*c=b", "*c=c", "*c=d", "*c=*b","*c=*c","*c=*d","*c=", -"*d=a", "*d=b", "*d=c", "*d=d", "*d=*b","*d=*c","*d=*d","*d=", -"", "", "", "", "", "", "", "", -"a+=a", "a+=b", "a+=c", "a+=d", "a+=*b","a+=*c","a+=*d","a+=", -"a-=a", "a-=b", "a-=c", "a-=d", "a-=*b","a-=*c","a-=*d","a-=", -"a*=a", "a*=b", "a*=c", "a*=d", "a*=*b","a*=*c","a*=*d","a*=", -"a/=a", "a/=b", "a/=c", "a/=d", "a/=*b","a/=*c","a/=*d","a/=", -"a%=a", "a%=b", "a%=c", "a%=d", "a%=*b","a%=*c","a%=*d","a%=", -"a&=a", "a&=b", "a&=c", "a&=d", "a&=*b","a&=*c","a&=*d","a&=", -"a&~a", "a&~b", "a&~c", "a&~d", "a&~*b","a&~*c","a&~*d","a&~", -"a|=a", "a|=b", "a|=c", "a|=d", "a|=*b","a|=*c","a|=*d","a|=", -"a^=a", "a^=b", "a^=c", "a^=d", "a^=*b","a^=*c","a^=*d","a^=", -"a<<=a","a<<=b","a<<=c","a<<=d","a<<=*b","a<<=*c","a<<=*d","a<<=", -"a>>=a","a>>=b","a>>=c","a>>=d","a>>=*b","a>>=*c","a>>=*d","a>>=", -"a==a", "a==b", "a==c", "a==d", "a==*b","a==*c","a==*d","a==", -"aa", "a>b", "a>c", "a>d", "a>*b", "a>*c", "a>*d", "a>", -"", "", "", "", "", "", "", "", -"", "", "", "", "", "", "", "lj", -"post", "pcomp","end", "if", "ifnot","else", "endif","do", -"while","until","forever","ifl","ifnotl","elsel",";", 0}; - -// Advance in to start of next token. Tokens are delimited by white -// space. Comments inclosed in ((nested) parenthsis) are skipped. -void Compiler::next() { - assert(in); - for (; *in; ++in) { - if (*in=='\n') ++line; - if (*in=='(') state+=1+(state<0); - else if (state>0 && *in==')') --state; - else if (state<0 && *in<=' ') state=0; - else if (state==0 && *in>' ') {state=-1; break;} - } - if (!*in) error("unexpected end of config"); -} - -// convert to lower case -int tolower(int c) {return (c>='A' && c<='Z') ? 
c+'a'-'A' : c;} - -// return true if in==word up to white space or '(', case insensitive -bool Compiler::matchToken(const char* word) { - const char* a=in; - for (; (*a>' ' && *a!='(' && *word); ++a, ++word) - if (tolower(*a)!=tolower(*word)) return false; - return !*word && (*a<=' ' || *a=='('); -} - -// Print error message and exit -void Compiler::syntaxError(const char* msg, const char* expected) { - Array sbuf(128); // error message to report - char* s=&sbuf[0]; - strcat(s, "Config line "); - for (int i=strlen(s), r=1000000; r; r/=10) // append line number - if (line/r) s[i++]='0'+line/r%10; - strcat(s, " at "); - for (int i=strlen(s); i<40 && *in>' '; ++i) // append token found - s[i]=*in++; - strcat(s, ": "); - strncat(s, msg, 40); // append message - if (expected) { - strcat(s, ", expected: "); - strncat(s, expected, 20); // append expected token if any - } - error(s); -} - -// Read a token, which must be in the NULL terminated list or else -// exit with an error. If found, return its index. -int Compiler::rtoken(const char* list[]) { - assert(in); - assert(list); - next(); - for (int i=0; list[i]; ++i) - if (matchToken(list[i])) - return i; - syntaxError("unexpected"); - assert(0); - return -1; // not reached -} - -// Read a token which must be the specified value s -void Compiler::rtoken(const char* s) { - assert(s); - next(); - if (!matchToken(s)) syntaxError("expected", s); -} - -// Read a number in (low...high) or exit with an error -// For numbers like $N+M, return arg[N-1]+M -int Compiler::rtoken(int low, int high) { - next(); - int r=0; - if (in[0]=='$' && in[1]>='1' && in[1]<='9') { - if (in[2]=='+') r=atoi(in+3); - if (args) r+=args[in[1]-'1']; - } - else if (in[0]=='-' || (in[0]>='0' && in[0]<='9')) r=atoi(in); - else syntaxError("expected a number"); - if (rhigh) syntaxError("number too high"); - return r; -} - -// Compile HCOMP or PCOMP code. Exit on error. 
Return -// code for end token (POST, PCOMP, END) -int Compiler::compile_comp(ZPAQL& z) { - int op=0; - const int comp_begin=z.hend; - while (true) { - op=rtoken(opcodelist); - if (op==POST || op==PCOMP || op==END) break; - int operand=-1; // 0...255 if 2 bytes - int operand2=-1; // 0...255 if 3 bytes - if (op==IF) { - op=JF; - operand=0; // set later - if_stack.push(z.hend+1); // save jump target location - } - else if (op==IFNOT) { - op=JT; - operand=0; - if_stack.push(z.hend+1); // save jump target location - } - else if (op==IFL || op==IFNOTL) { // long if - if (op==IFL) z.header[z.hend++]=(JT); - if (op==IFNOTL) z.header[z.hend++]=(JF); - z.header[z.hend++]=(3); - op=LJ; - operand=operand2=0; - if_stack.push(z.hend+1); - } - else if (op==ELSE || op==ELSEL) { - if (op==ELSE) op=JMP, operand=0; - if (op==ELSEL) op=LJ, operand=operand2=0; - int a=if_stack.pop(); // conditional jump target location - assert(a>comp_begin && a=0); - if (j>127) syntaxError("IF too big, try IFL, IFNOTL"); - z.header[a]=j; - } - else { // IFL, IFNOTL - int j=z.hend-comp_begin+2+(op==LJ); - assert(j>=0); - z.header[a]=j&255; - z.header[a+1]=(j>>8)&255; - } - if_stack.push(z.hend+1); // save JMP target location - } - else if (op==ENDIF) { - int a=if_stack.pop(); // jump target address - assert(a>comp_begin && a=0); - if (z.header[a-1]!=LJ) { - assert(z.header[a-1]==JT || z.header[a-1]==JF || z.header[a-1]==JMP); - if (j>127) syntaxError("IF too big, try IFL, IFNOTL, ELSEL\n"); - z.header[a]=j; - } - else { - assert(a+1>8)&255; - } - } - else if (op==DO) { - do_stack.push(z.hend); - } - else if (op==WHILE || op==UNTIL || op==FOREVER) { - int a=do_stack.pop(); - assert(a>=comp_begin && a=-127) { // backward short jump - if (op==WHILE) op=JT; - if (op==UNTIL) op=JF; - if (op==FOREVER) op=JMP; - operand=j&255; - } - else { // backward long jump - j=a-comp_begin; - assert(j>=0 && j>8; - } - } - else if ((op&7)==7) { // 2 byte operand, read N - if (op==LJ) { - operand=rtoken(0, 65535); - 
operand2=operand>>8; - operand&=255; - } - else if (op==JT || op==JF || op==JMP) { - operand=rtoken(-128, 127); - operand&=255; - } - else - operand=rtoken(0, 255); - } - if (op>=0 && op<=255) - z.header[z.hend++]=(op); - if (operand>=0) - z.header[z.hend++]=(operand); - if (operand2>=0) - z.header[z.hend++]=(operand2); - if (z.hend>=z.header.isize()-130 || z.hend-z.hbegin+z.cend-2>65535) - syntaxError("program too big"); - } - z.header[z.hend++]=(0); // END - return op; -} - -// Compile a configuration file. Store COMP/HCOMP section in hcomp. -// If there is a PCOMP section, store it in pcomp and store the PCOMP -// command in pcomp_cmd. Replace "$1..$9+n" with args[0..8]+n - -Compiler::Compiler(const char* in_, int* args_, ZPAQL& hz_, ZPAQL& pz_, - Writer* out2_): in(in_), args(args_), hz(hz_), pz(pz_), - out2(out2_), if_stack(1000), do_stack(1000) { - line=1; - state=0; - hz.clear(); - pz.clear(); - hz.header.resize(68000); - - // Compile the COMP section of header - rtoken("comp"); - hz.header[2]=rtoken(0, 255); // hh - hz.header[3]=rtoken(0, 255); // hm - hz.header[4]=rtoken(0, 255); // ph - hz.header[5]=rtoken(0, 255); // pm - const int n=hz.header[6]=rtoken(0, 255); // n - hz.cend=7; - for (int i=0; i10) syntaxError("invalid component"); - for (int j=1; j>8; - - // Compile POST 0 END - if (op==POST) { - rtoken(0, 0); - rtoken("end"); - } - - // Compile PCOMP pcomp_cmd ; program... 
END - else if (op==PCOMP) { - pz.header.resize(68000); - pz.header[4]=hz.header[4]; // ph - pz.header[5]=hz.header[5]; // pm - pz.cend=8; - pz.hbegin=pz.hend=pz.cend+128; - - // get pcomp_cmd ending with ";" (case sensitive) - next(); - while (*in && *in!=';') { - if (out2) - out2->put(*in); - ++in; - } - if (*in) ++in; - - // Compile PCOMP - op=compile_comp(pz); - int len=pz.cend-2+pz.hend-pz.hbegin; // insert header size - assert(len>=0); - pz.header[0]=len&255; - pz.header[1]=len>>8; - if (op!=END) - syntaxError("expected END"); - } - else if (op!=END) - syntaxError("expected END or POST 0 END or PCOMP cmd ; ... END"); -} - -///////////////////// Compressor ////////////////////// - -// Write 13 byte start tag -// "\x37\x6B\x53\x74\xA0\x31\x83\xD3\x8C\xB2\x28\xB0\xD3" -void Compressor::writeTag() { - assert(state==INIT); - enc.out->put(0x37); - enc.out->put(0x6b); - enc.out->put(0x53); - enc.out->put(0x74); - enc.out->put(0xa0); - enc.out->put(0x31); - enc.out->put(0x83); - enc.out->put(0xd3); - enc.out->put(0x8c); - enc.out->put(0xb2); - enc.out->put(0x28); - enc.out->put(0xb0); - enc.out->put(0xd3); -} - -void Compressor::startBlock(int level) { - - // Model 1 - min.cfg - static const char models[]={ - 26,0,1,2,0,0,2,3,16,8,19,0,0,96,4,28, - 59,10,59,112,25,10,59,10,59,112,56,0, - - // Model 2 - mid.cfg - 69,0,3,3,0,0,8,3,5,8,13,0,8,17,1,8, - 18,2,8,18,3,8,19,4,4,22,24,7,16,0,7,24, - (char)-1,0,17,104,74,4,95,1,59,112,10,25,59,112,10,25, - 59,112,10,25,59,112,10,25,59,112,10,25,59,10,59,112, - 25,69,(char)-49,8,112,56,0, - - // Model 3 - max.cfg - (char)-60,0,5,9,0,0,22,1,(char)-96,3,5,8,13,1,8,16, - 2,8,18,3,8,19,4,8,19,5,8,20,6,4,22,24, - 3,17,8,19,9,3,13,3,13,3,13,3,14,7,16,0, - 15,24,(char)-1,7,8,0,16,10,(char)-1,6,0,15,16,24,0,9, - 8,17,32,(char)-1,6,8,17,18,16,(char)-1,9,16,19,32,(char)-1,6, - 0,19,20,16,0,0,17,104,74,4,95,2,59,112,10,25, - 59,112,10,25,59,112,10,25,59,112,10,25,59,112,10,25, - 
59,10,59,112,10,25,59,112,10,25,69,(char)-73,32,(char)-17,64,47, - 14,(char)-25,91,47,10,25,60,26,48,(char)-122,(char)-105,20,112,63,9,70, - (char)-33,0,39,3,25,112,26,52,25,25,74,10,4,59,112,25, - 10,4,59,112,25,10,4,59,112,25,65,(char)-113,(char)-44,72,4,59, - 112,8,(char)-113,(char)-40,8,68,(char)-81,60,60,25,69,(char)-49,9,112,25,25, - 25,25,25,112,56,0, - - 0,0}; // 0,0 = end of list - - if (level<1) error("compression level must be at least 1"); - const char* p=models; - int i; - for (i=1; i6); - enc.out->put('z'); - enc.out->put('P'); - enc.out->put('Q'); - enc.out->put(1+(z.header[6]==0)); // level 1 or 2 - enc.out->put(1); - z.write(enc.out, false); - state=BLOCK1; -} - -void Compressor::startBlock(const char* config, int* args, Writer* pcomp_cmd) { - assert(state==INIT); - Compiler(config, args, z, pz, pcomp_cmd); - pz.sha1=&sha1; - assert(z.header.isize()>6); - enc.out->put('z'); - enc.out->put('P'); - enc.out->put('Q'); - enc.out->put(1+(z.header[6]==0)); // level 1 or 2 - enc.out->put(1); - z.write(enc.out, false); - state=BLOCK1; -} - -// Write a segment header -void Compressor::startSegment(const char* filename, const char* comment) { - assert(state==BLOCK1 || state==BLOCK2); - enc.out->put(1); - while (filename && *filename) - enc.out->put(*filename++); - enc.out->put(0); - while (comment && *comment) - enc.out->put(*comment++); - enc.out->put(0); - enc.out->put(0); - if (state==BLOCK1) state=SEG1; - if (state==BLOCK2) state=SEG2; -} - -// Initialize encoding and write pcomp to first segment -// If len is 0 then length is encoded in pcomp[0..1] -// if pcomp is 0 then get pcomp from pz.header -void Compressor::postProcess(const char* pcomp, int len) { - if (state==SEG2) return; - assert(state==SEG1); - enc.init(); - if (!pcomp) { - len=pz.hend-pz.hbegin; - if (len>0) { - assert(pz.header.isize()>pz.hend); - assert(pz.hbegin>=0); - pcomp=(const char*)&pz.header[pz.hbegin]; - } - assert(len>=0); - } - else if (len==0) { - len=toU16(pcomp); - pcomp+=2; 
- } - if (len>0) { - enc.compress(1); - enc.compress(len&255); - enc.compress((len>>8)&255); - for (int i=0; i=0 && nread(buf, nbuf); - if (nr<0 || nr>BUFSIZE || nr>nbuf) error("invalid read size"); - if (nr<=0) return false; - if (n>=0) n-=nr; - for (int i=0; iput(0); - enc.out->put(0); - enc.out->put(0); - enc.out->put(0); - if (sha1string) { - enc.out->put(253); - for (int i=0; i<20; ++i) - enc.out->put(sha1string[i]); - } - else - enc.out->put(254); - state=BLOCK2; -} - -// End segment, write checksum and size is verify is true -char* Compressor::endSegmentChecksum(int64_t* size, bool dosha1) { - if (state==SEG1) - postProcess(); - assert(state==SEG2); - enc.compress(-1); - if (verify && pz.hend) { - pz.run(-1); - pz.flush(); - } - enc.out->put(0); - enc.out->put(0); - enc.out->put(0); - enc.out->put(0); - if (verify) { - if (size) *size=sha1.usize(); - memcpy(sha1result, sha1.result(), 20); - } - if (verify && dosha1) { - enc.out->put(253); - for (int i=0; i<20; ++i) - enc.out->put(sha1result[i]); - } - else - enc.out->put(254); - state=BLOCK2; - return verify ? sha1result : 0; -} - -// End block -void Compressor::endBlock() { - assert(state==BLOCK2); - enc.out->put(255); - state=INIT; -} - -/////////////////////////// compress() /////////////////////// - -void compress(Reader* in, Writer* out, const char* method, - const char* filename, const char* comment, bool dosha1) { - - // Get block size - int bs=4; - if (method && method[0] && method[1]>='0' && method[1]<='9') { - bs=method[1]-'0'; - if (method[2]>='0' && method[2]<='9') bs=bs*10+method[2]-'0'; - if (bs>11) bs=11; - } - bs=(0x100000<read((char*)sb.data(), bs))>0) { - sb.resize(n); - compressBlock(&sb, out, method, filename, comment, dosha1); - filename=0; - comment=0; - sb.resize(0); - } -} - -//////////////////////// ZPAQL::assemble() //////////////////// - -#ifndef NOJIT -/* -assemble(); - -Assembles the ZPAQL code in hcomp[0..hlen-1] and stores x86-32 or x86-64 -code in rcode[0..rcode_size-1]. 
Execution begins at rcode[0]. It will not -write beyond the end of rcode, but in any case it returns the number of -bytes that would have been written. It returns 0 in case of error. - -The assembled code implements int run() and returns 0 if successful, -1 if the ZPAQL code executes an invalid instruction or jumps out of -bounds, or 2 if OUT throws bad_alloc, or 3 for other OUT exceptions. - -A ZPAQL virtual machine has the following state. All values are -unsigned and initially 0: - - a, b, c, d: 32 bit registers (pointed to by their respective parameters) - f: 1 bit flag register (pointed to) - r[0..255]: 32 bit registers - m[0..msize-1]: 8 bit registers, where msize is a power of 2 - h[0..hsize-1]: 32 bit registers, where hsize is a power of 2 - out: pointer to a Writer - sha1: pointer to a SHA1 - -Generally a ZPAQL machine is used to compute contexts which are -placed in h. A second machine might post-process, and write its -output to out and sha1. In either case, a machine is called with -its input in a, representing a single byte (0..255) or -(for a postprocessor) EOF (0xffffffff). Execution returs after a -ZPAQL halt instruction. - -ZPAQL instructions are 1 byte unless the last 3 bits are 1. -In this case, a second operand byte follows. Opcode 255 is -the only 3 byte instruction. They are organized: - - 00dddxxx = unary opcode xxx on destination ddd (ddd < 111) - 00111xxx = special instruction xxx - 01dddsss = assignment: ddd = sss (ddd < 111) - 1xxxxsss = operation xxxx from sss to a - -The meaning of sss and ddd are as follows: - - 000 = a (accumulator) - 001 = b - 010 = c - 011 = d - 100 = *b (means m[b mod msize]) - 101 = *c (means m[c mod msize]) - 110 = *d (means h[d mod hsize]) - 111 = n (constant 0..255 in second byte of instruction) - -For example, 01001110 assigns *d to b. 
The other instructions xxx -are as follows: - -Group 00dddxxx where ddd < 111 and xxx is: - 000 = ddd<>a, swap with a (except 00000000 is an error, and swap - with *b or *c leaves the high bits of a unchanged) - 001 = ddd++, increment - 010 = ddd--, decrement - 011 = ddd!, not (invert all bits) - 100 = ddd=0, clear (set all bits of ddd to 0) - 101 = not used (error) - 110 = not used - 111 = ddd=r n, assign from r[n] to ddd, n=0..255 in next opcode byte -Except: - 00100111 = jt n, jump if f is true (n = -128..127, relative to next opcode) - 00101111 = jf n, jump if f is false (n = -128..127) - 00110111 = r=a n, assign r[n] = a (n = 0..255) - -Group 00111xxx where xxx is: - 000 = halt (return) - 001 = output a - 010 = not used - 011 = hash: a = (a + *b + 512) * 773 - 100 = hashd: *d = (*d + a + 512) * 773 - 101 = not used - 110 = not used - 111 = unconditional jump (n = -128 to 127, relative to next opcode) - -Group 1xxxxsss where xxxx is: - 0000 = a += sss (add, subtract, multiply, divide sss to a) - 0001 = a -= sss - 0010 = a *= sss - 0011 = a /= sss (unsigned, except set a = 0 if sss is 0) - 0100 = a %= sss (remainder, except set a = 0 if sss is 0) - 0101 = a &= sss (bitwise AND) - 0110 = a &= ~sss (bitwise AND with complement of sss) - 0111 = a |= sss (bitwise OR) - 1000 = a ^= sss (bitwise XOR) - 1001 = a <<= (sss % 32) (left shift by low 5 bits of sss) - 1010 = a >>= (sss % 32) (unsigned, zero bits shifted in) - 1011 = a == sss (compare, set f = true if equal or false otherwise) - 1100 = a < sss (unsigned compare, result in f) - 1101 = a > sss (unsigned compare) - 1110 = not used - 1111 = not used except 11111111 is a 3 byte jump to the absolute address - in the next 2 bytes in little-endian (LSB first) order. - -assemble() translates ZPAQL to 32 bit x86 code to be executed by run(). 
-Registers are mapped as follows: - - eax = source sss from *b, *c, *d or sometimes n - ecx = pointer to destination *b, *c, *d, or spare - edx = a - ebx = f (1 for true, 0 for false) - esp = stack pointer - ebp = d - esi = b - edi = c - -run() saves non-volatile registers (ebp, esi, edi, ebx) on the stack, -loads a, b, c, d, f, and executes the translated instructions. -A halt instruction saves a, b, c, d, f, pops the saved registers -and returns. Invalid instructions or jumps outside of the range -of the ZPAQL code call libzpaq::error(). - -In 64 bit mode, the following additional registers are used: - - r12 = h - r14 = r - r15 = m - -*/ - -// Called by out -static int flush1(ZPAQL* z) { - try { - z->flush(); - return 0; - } - catch(std::bad_alloc& x) { - return 2; - } - catch(...) { - return 3; - } -} - -// return true if op is an undefined ZPAQL instruction -static bool iserr(int op) { - return op==0 || (op>=120 && op<=127) || (op>=240 && op<=254) - || op==58 || (op<64 && (op%8==5 || op%8==6)); -} - -// Return length of ZPAQL instruction at hcomp[0]. Assume 0 padding at end. -// A run of identical ++ or -- is counted as 1 instruction. -static int oplen(const U8* hcomp) { - if (*hcomp==255) return 3; - if (*hcomp%8==7) return 2; - if (*hcomp<51 && (*hcomp%8-1)/2==0) { // ++ or -- opcode - int i; - for (i=1; i<127 && hcomp[i]==hcomp[0]; ++i); - return i; - } - return 1; -} - -// Write k bytes of x to rcode[o++] MSB first -static void put(U8* rcode, int n, int& o, U32 x, int k) { - while (k-->0) { - if (o>(k*8))&255; - ++o; - } -} - -// Write 4 bytes of x to rcode[o++] LSB first -static void put4lsb(U8* rcode, int n, int& o, U32 x) { - for (int k=0; k<4; ++k) { - if (o>(k*8))&255; - ++o; - } -} - -// Write a 1-4 byte x86 opcode without or with an 4 byte operand -// to rcode[o...] 
-#define put1(x) put(rcode, rcode_size, o, (x), 1) -#define put2(x) put(rcode, rcode_size, o, (x), 2) -#define put3(x) put(rcode, rcode_size, o, (x), 3) -#define put4(x) put(rcode, rcode_size, o, (x), 4) -#define put5(x,y) put4(x), put1(y) -#define put6(x,y) put4(x), put2(y) -#define put4r(x) put4lsb(rcode, rcode_size, o, x) -#define puta(x) t=U32(size_t(x)), put4r(t) -#define put1a(x,y) put1(x), puta(y) -#define put2a(x,y) put2(x), puta(y) -#define put3a(x,y) put3(x), puta(y) -#define put4a(x,y) put4(x), puta(y) -#define put5a(x,y,z) put4(x), put1(y), puta(z) -#define put2l(x,y) put2(x), t=U32(size_t(y)), put4r(t), \ - t=U32(size_t(y)>>(S*4)), put4r(t) - -// Assemble ZPAQL in in the HCOMP section of header to rcode, -// but do not write beyond rcode_size. Return the number of -// bytes output or that would have been output. -// Execution starts at rcode[0] and returns 1 if successful or 0 -// in case of a ZPAQL execution error. -int ZPAQL::assemble() { - - // x86? (not foolproof) - const int S=sizeof(char*); // 4 = x86, 8 = x86-64 - U32 t=0x12345678; - if (*(char*)&t!=0x78 || (S!=4 && S!=8)) - error("JIT supported only for x86-32 and x86-64"); - - const U8* hcomp=&header[hbegin]; - const int hlen=hend-hbegin+2; - const int msize=m.size(); - const int hsize=h.size(); - static const int regcode[8]={2,6,7,5}; // a,b,c,d.. -> edx,esi,edi,ebp,eax.. 
- Array it(hlen); // hcomp -> rcode locations - int done=0; // number of instructions assembled (0..hlen) - int o=5; // rcode output index, reserve space for jmp - - // Code for the halt instruction (restore registers and return) - const int halt=o; - if (S==8) { - put2l(0x48b9, &a); // mov rcx, a - put2(0x8911); // mov [rcx], edx - put2l(0x48b9, &b); // mov rcx, b - put2(0x8931); // mov [rcx], esi - put2l(0x48b9, &c); // mov rcx, c - put2(0x8939); // mov [rcx], edi - put2l(0x48b9, &d); // mov rcx, d - put2(0x8929); // mov [rcx], ebp - put2l(0x48b9, &f); // mov rcx, f - put2(0x8919); // mov [rcx], ebx - put4(0x4883c408); // add rsp, 8 - put2(0x415f); // pop r15 - put2(0x415e); // pop r14 - put2(0x415d); // pop r13 - put2(0x415c); // pop r12 - } - else { - put2a(0x8915, &a); // mov [a], edx - put2a(0x8935, &b); // mov [b], esi - put2a(0x893d, &c); // mov [c], edi - put2a(0x892d, &d); // mov [d], ebp - put2a(0x891d, &f); // mov [f], ebx - put3(0x83c40c); // add esp, 12 - } - put1(0x5b); // pop ebx - put1(0x5f); // pop edi - put1(0x5e); // pop esi - put1(0x5d); // pop ebp - put1(0xc3); // ret - - // Code for the out instruction. - // Store a=edx at outbuf[bufptr++]. If full, call flush1(). 
- const int outlabel=o; - if (S==8) { - put2l(0x48b8, &outbuf[0]);// mov rax, outbuf.p - put2l(0x49ba, &bufptr); // mov r10, &bufptr - put3(0x418b0a); // mov rcx, [r10] - put3(0x881408); // mov [rax+rcx], dl - put2(0xffc1); // inc rcx - put3(0x41890a); // mov [r10], ecx - put2a(0x81f9, outbuf.size()); // cmp rcx, outbuf.size() - put2(0x7403); // jz L1 - put2(0x31c0); // xor eax, eax - put1(0xc3); // ret - - put1(0x55); // L1: push rbp ; call flush1(this) - put1(0x57); // push rdi - put1(0x56); // push rsi - put1(0x52); // push rdx - put1(0x51); // push rcx - put3(0x4889e5); // mov rbp, rsp - put4(0x4883c570); // add rbp, 112 -#if defined(unix) && !defined(__CYGWIN__) - put2l(0x48bf, this); // mov rdi, this -#else // Windows - put2l(0x48b9, this); // mov rcx, this -#endif - put2l(0x49bb, &flush1); // mov r11, &flush1 - put3(0x41ffd3); // call r11 - put1(0x59); // pop rcx - put1(0x5a); // pop rdx - put1(0x5e); // pop rsi - put1(0x5f); // pop rdi - put1(0x5d); // pop rbp - } - else { - put1a(0xb8, &outbuf[0]); // mov eax, outbuf.p - put2a(0x8b0d, &bufptr); // mov ecx, [bufptr] - put3(0x881408); // mov [eax+ecx], dl - put2(0xffc1); // inc ecx - put2a(0x890d, &bufptr); // mov [bufptr], ecx - put2a(0x81f9, outbuf.size()); // cmp ecx, outbuf.size() - put2(0x7403); // jz L1 - put2(0x31c0); // xor eax, eax - put1(0xc3); // ret - put3(0x83ec0c); // L1: sub esp, 12 - put4(0x89542404); // mov [esp+4], edx - put3a(0xc70424, this); // mov [esp], this - put1a(0xb8, &flush1); // mov eax, &flush1 - put2(0xffd0); // call eax - put4(0x8b542404); // mov edx, [esp+4] - put3(0x83c40c); // add esp, 12 - } - put1(0xc3); // ret - - // Set it[i]=1 for each ZPAQL instruction reachable from the previous - // instruction + 2 if reachable by a jump (or 3 if both). 
- it[0]=2; - assert(hlen>0 && hcomp[hlen-1]==0); // ends with error - do { - done=0; - const int NONE=0x80000000; - for (int i=0; i>24);// jt,jf,jmp - if (op==63) next1=NONE; // jmp - if ((next2<0 || next2>=hlen) && next2!=NONE) next2=hlen-1; // error - if (next1>=0 && next1=0 && next20); - - // Set it[i] bits 2-3 to 4, 8, or 12 if a comparison - // (==, <, > respectively) does not need to save the result in f, - // or if a conditional jump (jt, jf) does not need to read f. - // This is true if a comparison is followed directly by a jt/jf, - // the jt/jf is not a jump target, the byte before is not a jump - // target (for a 2 byte comparison), and for the comparison instruction - // if both paths after the jt/jf lead to another comparison or error - // before another jt/jf. At most hlen steps are traced because after - // that it must be an infinite loop. - for (int i=0; i=216 && op1<240 && (op2==39 || op2==47) - && it[i2]==1 && (i2==i+1 || it[i+1]==0)) { - int code=(op1-208)/8*4; // 4,8,12 is ==,<,> - it[i2]+=code; // OK to test CF, ZF instead of f - for (int j=0; j<2 && code; ++j) { // trace each path from i2 - int k=i2+2; // branch not taken - if (j==1) k=i2+2+(hcomp[i2+1]<<24>>24); // branch taken - for (int l=0; l=hlen) break; // out of bounds, pass - const int op=hcomp[k]; - if (op==39 || op==47) code=0; // jt,jf, fail - else if (op>=216 && op<240) break; // ==,<,>, pass - else if (iserr(op)) break; // error, pass - else if (op==255) k=hcomp[k+1]+256*hcomp[k+2]; // lj - else if (op==63) k=k+2+(hcomp[k+1]<<24>>24); // jmp - else if (op==56) k=0; // halt - else k=k+1+(op%8==7); // ordinary instruction - } - } - it[i]+=code; // if > 0 then OK to not save flags in f (bl) - } - } - - // Start of run(): Save x86 and load ZPAQL registers - const int start=o; - assert(start>=16); - put1(0x55); // push ebp/rbp - put1(0x56); // push esi/rsi - put1(0x57); // push edi/rdi - put1(0x53); // push ebx/rbx - if (S==8) { - put2(0x4154); // push r12 - put2(0x4155); // push r13 
- put2(0x4156); // push r14 - put2(0x4157); // push r15 - put4(0x4883ec08); // sub rsp, 8 - put2l(0x48b8, &a); // mov rax, a - put2(0x8b10); // mov edx, [rax] - put2l(0x48b8, &b); // mov rax, b - put2(0x8b30); // mov esi, [rax] - put2l(0x48b8, &c); // mov rax, c - put2(0x8b38); // mov edi, [rax] - put2l(0x48b8, &d); // mov rax, d - put2(0x8b28); // mov ebp, [rax] - put2l(0x48b8, &f); // mov rax, f - put2(0x8b18); // mov ebx, [rax] - put2l(0x49bc, &h[0]); // mov r12, h - put2l(0x49bd, &outbuf[0]); // mov r13, outbuf.p - put2l(0x49be, &r[0]); // mov r14, r - put2l(0x49bf, &m[0]); // mov r15, m - } - else { - put3(0x83ec0c); // sub esp, 12 - put2a(0x8b15, &a); // mov edx, [a] - put2a(0x8b35, &b); // mov esi, [b] - put2a(0x8b3d, &c); // mov edi, [c] - put2a(0x8b2d, &d); // mov ebp, [d] - put2a(0x8b1d, &f); // mov ebx, [f] - } - - // Assemble in multiple passes until every byte of hcomp has a translation - for (int istart=0; istarti); - assert(i>=0 && i=16) { - if (i>istart) { - int a=code-o; - if (a>-120 && a<120) - put2(0xeb00+((a-2)&255)); // jmp short o - else - put1a(0xe9, a-5); // jmp near o - } - break; - } - - // Else assemble the instruction at hcomp[i] to rcode[o] - else { - assert(i>=0 && i0 && it[i]<16); - assert(o>=16); - it[i]=o; - ++done; - const int op=hcomp[i]; - const int arg=hcomp[i+1]+((op==255)?256*hcomp[i+2]:0); - const int ddd=op/8%8; - const int sss=op%8; - - // error instruction: return 1 - if (iserr(op)) { - put1a(0xb8, 1); // mov eax, 1 - put1a(0xe9, halt-o-4); // jmp near halt - continue; - } - - // Load source *b, *c, *d, or hash (*b) into eax except: - // {a,b,c,d}=*d, a{+,-,*,&,|,^,=,==,>,>}=*d: load address to eax - // {a,b,c,d}={*b,*c}: load source into ddd - if (op==59 || (op>=64 && op<240 && op%8>=4 && op%8<7)) { - put2(0x89c0+8*regcode[sss-3+(op==59)]); // mov eax, {esi,edi,ebp} - const int sz=(sss==6?hsize:msize)-1; - if (sz>=128) put1a(0x25, sz); // and eax, dword msize-1 - else put3(0x83e000+sz); // and eax, byte msize-1 - const 
int move=(op>=64 && op<112); // = or else ddd is eax - if (sss<6) { // ddd={a,b,c,d,*b,*c} - if (S==8) put5(0x410fb604+8*move*regcode[ddd],0x07); - // movzx ddd, byte [r15+rax] - else put3a(0x0fb680+8*move*regcode[ddd], &m[0]); - // movzx ddd, byte [m+eax] - } - else if ((0x06587000>>(op/8))&1) {// {*b,*c,*d,a/,a%,a&~,a<<,a>>}=*d - if (S==8) put4(0x418b0484); // mov eax, [r12+rax*4] - else put3a(0x8b0485, &h[0]); // mov eax, [h+eax*4] - } - } - - // Load destination address *b, *c, *d or hashd (*d) into ecx - if ((op>=32 && op<56 && op%8<5) || (op>=96 && op<120) || op==60) { - put2(0x89c1+8*regcode[op/8%8-3-(op==60)]);// mov ecx,{esi,edi,ebp} - const int sz=(ddd==6||op==60?hsize:msize)-1; - if (sz>=128) put2a(0x81e1, sz); // and ecx, dword sz - else put3(0x83e100+sz); // and ecx, byte sz - if (op/8%8==6 || op==60) { // *d - if (S==8) put4(0x498d0c8c); // lea rcx, [r12+rcx*4] - else put3a(0x8d0c8d, &h[0]); // lea ecx, [ecx*4+h] - } - else { // *b, *c - if (S==8) put4(0x498d0c0f); // lea rcx, [r15+rcx] - else put2a(0x8d89, &m[0]); // lea ecx, [ecx+h] - } - } - - // Translate by opcode - switch((op/8)&31) { - case 0: // ddd = a - case 1: // ddd = b - case 2: // ddd = c - case 3: // ddd = d - switch(sss) { - case 0: // ddd<>a (swap) - put2(0x87d0+regcode[ddd]); // xchg edx, ddd - break; - case 1: // ddd++ - put3(0x83c000+256*regcode[ddd]+inc); // add ddd, inc - break; - case 2: // ddd-- - put3(0x83e800+256*regcode[ddd]+inc); // sub ddd, inc - break; - case 3: // ddd! 
- put2(0xf7d0+regcode[ddd]); // not ddd - break; - case 4: // ddd=0 - put2(0x31c0+9*regcode[ddd]); // xor ddd,ddd - break; - case 7: // ddd=r n - if (S==8) - put3a(0x418b86+8*regcode[ddd], arg*4); // mov ddd, [r14+n*4] - else - put2a(0x8b05+8*regcode[ddd], (&r[arg]));//mov ddd, [r+n] - break; - } - break; - case 4: // ddd = *b - case 5: // ddd = *c - switch(sss) { - case 0: // ddd<>a (swap) - put2(0x8611); // xchg dl, [ecx] - break; - case 1: // ddd++ - put3(0x800100+inc); // add byte [ecx], inc - break; - case 2: // ddd-- - put3(0x802900+inc); // sub byte [ecx], inc - break; - case 3: // ddd! - put2(0xf611); // not byte [ecx] - break; - case 4: // ddd=0 - put2(0x31c0); // xor eax, eax - put2(0x8801); // mov [ecx], al - break; - case 7: // jt, jf - { - assert(code>=0 && code<16); - static const unsigned char jtab[2][4]={{5,4,2,7},{4,5,3,6}}; - // jnz,je,jb,ja, jz,jne,jae,jbe - if (code<4) put2(0x84db); // test bl, bl - if (arg>=128 && arg-257-i>=0 && o-it[arg-257-i]<120) - put2(0x7000+256*jtab[op==47][code/4]); // jx short 0 - else - put2a(0x0f80+jtab[op==47][code/4], 0); // jx near 0 - break; - } - } - break; - case 6: // ddd = *d - switch(sss) { - case 0: // ddd<>a (swap) - put2(0x8711); // xchg edx, [ecx] - break; - case 1: // ddd++ - put3(0x830100+inc); // add dword [ecx], inc - break; - case 2: // ddd-- - put3(0x832900+inc); // sub dword [ecx], inc - break; - case 3: // ddd! 
- put2(0xf711); // not dword [ecx] - break; - case 4: // ddd=0 - put2(0x31c0); // xor eax, eax - put2(0x8901); // mov [ecx], eax - break; - case 7: // ddd=r n - if (S==8) - put3a(0x418996, arg*4); // mov [r14+n*4], edx - else - put2a(0x8915, &r[arg]); // mov [r+n], edx - break; - } - break; - case 7: // special - switch(op) { - case 56: // halt - put2(0x31c0); // xor eax, eax ; return 0 - put1a(0xe9, halt-o-4); // jmp near halt - break; - case 57: // out - put1a(0xe8, outlabel-o-4);// call outlabel - put3(0x83f800); // cmp eax, 0 ; returned error code - put2(0x7405); // je L1: - put1a(0xe9, halt-o-4); // jmp near halt ; L1: - break; - case 59: // hash: a = (a + *b + 512) * 773 - put3a(0x8d8410, 512); // lea edx, [eax+edx+512] - put2a(0x69d0, 773); // imul edx, eax, 773 - break; - case 60: // hashd: *d = (*d + a + 512) * 773 - put2(0x8b01); // mov eax, [ecx] - put3a(0x8d8410, 512); // lea eax, [eax+edx+512] - put2a(0x69c0, 773); // imul eax, eax, 773 - put2(0x8901); // mov [ecx], eax - break; - case 63: // jmp - put1a(0xe9, 0); // jmp near 0 (fill in target later) - break; - } - break; - case 8: // a= - case 9: // b= - case 10: // c= - case 11: // d= - if (sss==7) // n - put1a(0xb8+regcode[ddd], arg); // mov ddd, n - else if (sss==6) { // *d - if (S==8) - put4(0x418b0484+(regcode[ddd]<<11)); // mov ddd, [r12+rax*4] - else - put3a(0x8b0485+(regcode[ddd]<<11),&h[0]);// mov ddd, [h+eax*4] - } - else if (sss<4) // a, b, c, d - put2(0x89c0+regcode[ddd]+8*regcode[sss]);// mov ddd,sss - break; - case 12: // *b= - case 13: // *c= - if (sss==7) put3(0xc60100+arg); // mov byte [ecx], n - else if (sss==0) put2(0x8811); // mov byte [ecx], dl - else { - if (sss<4) put2(0x89c0+8*regcode[sss]);// mov eax, sss - put2(0x8801); // mov byte [ecx], al - } - break; - case 14: // *d= - if (sss<7) put2(0x8901+8*regcode[sss]); // mov [ecx], sss - else put2a(0xc701, arg); // mov dword [ecx], n - break; - case 15: break; // not used - case 16: // a+= - if (sss==6) { - if (S==8) 
put4(0x41031484); // add edx, [r12+rax*4] - else put3a(0x031485, &h[0]); // add edx, [h+eax*4] - } - else if (sss<7) put2(0x01c2+8*regcode[sss]);// add edx, sss - else if (arg>=128) put2a(0x81c2, arg); // add edx, n - else put3(0x83c200+arg); // add edx, byte n - break; - case 17: // a-= - if (sss==6) { - if (S==8) put4(0x412b1484); // sub edx, [r12+rax*4] - else put3a(0x2b1485, &h[0]); // sub edx, [h+eax*4] - } - else if (sss<7) put2(0x29c2+8*regcode[sss]);// sub edx, sss - else if (arg>=128) put2a(0x81ea, arg); // sub edx, n - else put3(0x83ea00+arg); // sub edx, byte n - break; - case 18: // a*= - if (sss==6) { - if (S==8) put5(0x410faf14,0x84); // imul edx, [r12+rax*4] - else put4a(0x0faf1485, &h[0]); // imul edx, [h+eax*4] - } - else if (sss<7) put3(0x0fafd0+regcode[sss]);// imul edx, sss - else if (arg>=128) put2a(0x69d2, arg); // imul edx, n - else put3(0x6bd200+arg); // imul edx, byte n - break; - case 19: // a/= - case 20: // a%= - if (sss<7) put2(0x89c1+8*regcode[sss]); // mov ecx, sss - else put1a(0xb9, arg); // mov ecx, n - put2(0x85c9); // test ecx, ecx - put3(0x0f44d1); // cmovz edx, ecx - put2(0x7408-2*(op/8==20)); // jz (over rest) - put2(0x89d0); // mov eax, edx - put2(0x31d2); // xor edx, edx - put2(0xf7f1); // div ecx - if (op/8==19) put2(0x89c2); // mov edx, eax - break; - case 21: // a&= - if (sss==6) { - if (S==8) put4(0x41231484); // and edx, [r12+rax*4] - else put3a(0x231485, &h[0]); // and edx, [h+eax*4] - } - else if (sss<7) put2(0x21c2+8*regcode[sss]);// and edx, sss - else if (arg>=128) put2a(0x81e2, arg); // and edx, n - else put3(0x83e200+arg); // and edx, byte n - break; - case 22: // a&~ - if (sss==7) { - if (arg<128) put3(0x83e200+(~arg&255));// and edx, byte ~n - else put2a(0x81e2, ~arg); // and edx, ~n - } - else { - if (sss<4) put2(0x89c0+8*regcode[sss]);// mov eax, sss - put2(0xf7d0); // not eax - put2(0x21c2); // and edx, eax - } - break; - case 23: // a|= - if (sss==6) { - if (S==8) put4(0x410b1484); // or edx, [r12+rax*4] - 
else put3a(0x0b1485, &h[0]); // or edx, [h+eax*4] - } - else if (sss<7) put2(0x09c2+8*regcode[sss]);// or edx, sss - else if (arg>=128) put2a(0x81ca, arg); // or edx, n - else put3(0x83ca00+arg); // or edx, byte n - break; - case 24: // a^= - if (sss==6) { - if (S==8) put4(0x41331484); // xor edx, [r12+rax*4] - else put3a(0x331485, &h[0]); // xor edx, [h+eax*4] - } - else if (sss<7) put2(0x31c2+8*regcode[sss]);// xor edx, sss - else if (arg>=128) put2a(0x81f2, arg); // xor edx, byte n - else put3(0x83f200+arg); // xor edx, n - break; - case 25: // a<<= - case 26: // a>>= - if (sss==7) // sss = n - put3(0xc1e200+8*256*(op/8==26)+arg); // shl/shr n - else { - put2(0x89c1+8*regcode[sss]); // mov ecx, sss - put2(0xd3e2+8*(op/8==26)); // shl/shr edx, cl - } - break; - case 27: // a== - case 28: // a< - case 29: // a> - if (sss==6) { - if (S==8) put4(0x413b1484); // cmp edx, [r12+rax*4] - else put3a(0x3b1485, &h[0]); // cmp edx, [h+eax*4] - } - else if (sss==7) // sss = n - put2a(0x81fa, arg); // cmp edx, dword n - else - put2(0x39c2+8*regcode[sss]); // cmp edx, sss - if (code<4) { - if (op/8==27) put3(0x0f94c3); // setz bl - if (op/8==28) put3(0x0f92c3); // setc bl - if (op/8==29) put3(0x0f97c3); // seta bl - } - break; - case 30: // not used - case 31: // 255 = lj - if (op==255) put1a(0xe9, 0); // jmp near - break; - } - } - } - } - - // Finish first pass - const int rsize=o; - if (o>rcode_size) return rsize; - - // Fill in jump addresses (second pass) - for (int i=0; i=128) target-=256; - target+=i+2; - } - if (target<0 || target>=hlen) target=hlen-1; // runtime ZPAQL error - o=it[i]; - assert(o>=16 && o skip test - assert(o>=16 && o=0x72 && op<0x78) || op==0xeb) { // jx, jmp short - --target; - if (target<-128 || target>127) - error("Cannot code x86 short jump"); - assert(o=0x82 && op<0x88) || op==0xe9) // jx, jmp near - { - target-=4; - puta(target); - } - else assert(false); // not a x86 jump - } - } - - // Jump to start - o=0; - put1a(0xe9, start-5); // jmp near 
start - return rsize; -} - -//////////////////////// Predictor::assemble_p() ///////////////////// - -// Assemble the ZPAQL code in the HCOMP section of z.header to pcomp and -// return the number of bytes of x86 or x86-64 code written, or that would -// be written if pcomp were large enough. The code for predict() begins -// at pr.pcomp[0] and update() at pr.pcomp[5], both as jmp instructions. - -// The assembled code is equivalent to int predict(Predictor*) -// and void update(Predictor*, int y); The Preditor address is placed in -// edi/rdi. The update bit y is placed in ebp/rbp. - -int Predictor::assemble_p() { - Predictor& pr=*this; - U8* rcode=pr.pcode; // x86 output array - int rcode_size=pcode_size; // output size - int o=0; // output index in pcode - const int S=sizeof(char*); // 4 or 8 - U8* hcomp=&pr.z.header[0]; // The code to translate -#define off(x) ((char*)&(pr.x)-(char*)&pr) -#define offc(x) ((char*)&(pr.comp[i].x)-(char*)&pr) - - // test for little-endian (probably x86) - U32 t=0x12345678; - if (*(char*)&t!=0x78 || (S!=4 && S!=8)) - error("JIT supported only for x86-32 and x86-64"); - - // Initialize for predict(). 
Put predictor address in edi/rdi - put1a(0xe9, 5); // jmp predict - put1a(0, 0x90909000); // reserve space for jmp update - put1(0x53); // push ebx/rbx - put1(0x55); // push ebp/rbp - put1(0x56); // push esi/rsi - put1(0x57); // push edi/rdi - if (S==4) - put4(0x8b7c2414); // mov edi,[esp+0x14] ; pr - else { -#if !defined(unix) || defined(__CYGWIN__) - put3(0x4889cf); // mov rdi, rcx (1st arg in Win64) -#endif - } - - // Code predict() for each component - const int n=hcomp[6]; // number of components - U8* cp=hcomp+7; - for (int i=0; i=pr.z.cend) error("comp too big"); - if (cp[0]<1 || cp[0]>9) error("invalid component"); - assert(compsize[cp[0]]>0 && compsize[cp[0]]<8); - switch (cp[0]) { - - case CONS: // c - break; - - case CM: // sizebits limit - // Component& cr=comp[i]; - // cr.cxt=h[i]^hmap4; - // p[i]=stretch(cr.cm(cr.cxt)>>17); - - put2a(0x8b87, off(h[i])); // mov eax, [edi+&h[i]] - put2a(0x3387, off(hmap4)); // xor eax, [edi+&hmap4] - put1a(0x25, (1<rsi) - put2a(0x8bb7, offc(cm)); // mov esi, [edi+&cm] - put3(0x8b0486); // mov eax, [esi+eax*4] - put3(0xc1e811); // shr eax, 17 - put4a(0x0fbf8447, off(stretcht)); // movsx eax,word[edi+eax*2+..] - put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax - break; - - case ISSE: // sizebits j -- c=hi, cxt=bh - // assert((hmap4&15)>0); - // if (c8==1 || (c8&0xf0)==16) - // cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); - // cr.cxt=cr.ht[cr.c+(hmap4&15)]; // bit history - // int *wt=(int*)&cr.cm[cr.cxt*2]; - // p[i]=clamp2k((wt[0]*p[cp[2]]+wt[1]*64)>>16); - - case ICM: // sizebits - // assert((hmap4&15)>0); - // if (c8==1 || (c8&0xf0)==16) cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); - // cr.cxt=cr.ht[cr.c+(hmap4&15)]; - // p[i]=stretch(cr.cm(cr.cxt)>>8); - // - // Find cxt row in hash table ht. ht has rows of 16 indexed by the low - // sizebits of cxt with element 0 having the next higher 8 bits for - // collision detection. If not found after 3 adjacent tries, replace - // row with lowest element 1 as priority. 
Return index of row. - // - // size_t Predictor::find(Array& ht, int sizebits, U32 cxt) { - // assert(ht.size()==size_t(16)<>sizebits&255; - // size_t h0=(cxt*16)&(ht.size()-16); - // if (ht[h0]==chk) return h0; - // size_t h1=h0^16; - // if (ht[h1]==chk) return h1; - // size_t h2=h0^32; - // if (ht[h2]==chk) return h2; - // if (ht[h0+1]<=ht[h1+1] && ht[h0+1]<=ht[h2+1]) - // return memset(&ht[h0], 0, 16), ht[h0]=chk, h0; - // else if (ht[h1+1]>(7-cr.cxt))&1; // predicted bit - // p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767); - // } - - if (S==8) put1(0x48); // rex.w - put2a(0x8bb7, offc(ht)); // mov esi, [edi+&ht] - - // If match length (a) is 0 then p[i]=0 - put2a(0x8b87, offc(a)); // mov eax, [edi+&a] - put2(0x85c0); // test eax, eax - put2(0x7449); // jz L2 ; p[i]=0 - - // Else put predicted bit in c - put1a(0xb9, 7); // mov ecx, 7 - put2a(0x2b8f, offc(cxt)); // sub ecx, [edi+&cxt] - put2a(0x8b87, offc(limit)); // mov eax, [edi+&limit] - put2a(0x2b87, offc(b)); // sub eax, [edi+&b] - put1a(0x25, (1<>8; - - put2a(0x8b87, off(p[cp[1]])); // mov eax, [edi+&p[j]] - put2a(0x2b87, off(p[cp[2]])); // sub eax, [edi+&p[k]] - put2a(0x69c0, cp[3]); // imul eax, wt - put3(0xc1f808); // sar eax, 8 - put2a(0x0387, off(p[cp[2]])); // add eax, [edi+&p[k]] - put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax - break; - - case MIX2: // sizebits j k rate mask - // c=size cm=wt[size] cxt=input - // cr.cxt=((h[i]+(c8&cp[5]))&(cr.c-1)); - // assert(cr.cxt=0 && w<65536); - // p[i]=(w*p[cp[2]]+(65536-w)*p[cp[3]])>>16; - // assert(p[i]>=-2048 && p[i]<2048); - - put2(0x8b07); // mov eax, [edi] ; c8 - put1a(0x25, cp[5]); // and eax, mask - put2a(0x0387, off(h[i])); // add eax, [edi+&h[i]] - put1a(0x25, (1<=1 && m<=i); - // cr.cxt=h[i]+(c8&cp[5]); - // cr.cxt=(cr.cxt&(cr.c-1))*m; // pointer to row of weights - // assert(cr.cxt<=cr.cm.size()-m); - // int* wt=(int*)&cr.cm[cr.cxt]; - // p[i]=0; - // for (int j=0; j>8)*p[cp[2]+j]; - // p[i]=clamp2k(p[i]>>8); - - put2(0x8b07); // mov eax, [edi] ; 
c8 - put1a(0x25, cp[5]); // and eax, mask - put2a(0x0387, off(h[i])); // add eax, [edi+&h[i]] - put1a(0x25, (1<3) put4a(0xf30f6f96, k*4+16);//movdqu xmm2, [esi+k*4+16] - put5(0x660f72e1,0x08); // psrad xmm1, 8 - if (tail>3) put5(0x660f72e2,0x08); // psrad xmm2, 8 - put4(0x660f6bca); // packssdw xmm1, xmm2 - put4a(0xf30f6f9f, off(p[cp[2]+k])); // movdqu xmm3, [edi+&p[j+k]] - if (tail>3) - put4a(0xf30f6fa7,off(p[cp[2]+k+4]));//movdqu xmm4, [edi+&p[j+k+4]] - put4(0x660f6bdc); // packssdw, xmm3, xmm4 - if (tail>0 && tail<8) { // last loop, mask extra weights - put4(0x660f76ed); // pcmpeqd xmm5, xmm5 ; -1 - put5(0x660f73dd, 16-tail*2); // psrldq xmm5, 16-tail*2 - put4(0x660fdbcd); // pand xmm1, xmm5 - } - if (k==0) { // first loop, initialize sum in xmm0 - put4(0xf30f6fc1); // movdqu xmm0, xmm1 - put4(0x660ff5c3); // pmaddwd xmm0, xmm3 - } - else { // accumulate sum in xmm0 - put4(0x660ff5cb); // pmaddwd xmm1, xmm3 - put4(0x660ffec1); // paddd xmm0, xmm1 - } - } - - // Add up the 4 elements of xmm0 = p[i] in the first element - put4(0xf30f6fc8); // movdqu xmm1, xmm0 - put5(0x660f73d9,0x08); // psrldq xmm1, 8 - put4(0x660ffec1); // paddd xmm0, xmm1 - put4(0xf30f6fc8); // movdqu xmm1, xmm0 - put5(0x660f73d9,0x04); // psrldq xmm1, 4 - put4(0x660ffec1); // paddd xmm0, xmm1 - put4(0x660f7ec0); // movd eax, xmm0 ; p[i] - put3(0xc1f808); // sar eax, 8 - put1a(0x3d, 2047); // cmp eax, 2047 - put2(0x7e05); // jle L1 - put1a(0xb8, 2047); // mov eax, 2047 - put1a(0x3d, -2048); // L1: cmp eax, -2048 - put2(0x7d05); // jge, L2 - put1a(0xb8, -2048); // mov eax, -2048 - put2a(0x8987, off(p[i])); // L2: mov [edi+&p[i]], eax - break; - - case SSE: // sizebits j start limit - // cr.cxt=(h[i]+c8)*32; - // int pq=p[cp[2]]+992; - // if (pq<0) pq=0; - // if (pq>1983) pq=1983; - // int wt=pq&63; - // pq>>=6; - // assert(pq>=0 && pq<=30); - // cr.cxt+=pq; - // p[i]=stretch(((cr.cm(cr.cxt)>>10)*(64-wt) // p0 - // +(cr.cm(cr.cxt+1)>>10)*wt)>>13); // p1 - // // p = p0*(64-wt)+p1*wt = (p1-p0)*wt + 
p0*64 - // cr.cxt+=wt>>5; - - put2a(0x8b8f, off(h[i])); // mov ecx, [edi+&h[i]] - put2(0x030f); // add ecx, [edi] ; c0 - put2a(0x81e1, (1<>5 - put2a(0x898f, offc(cxt)); // mov [edi+cxt], ecx ; cxt saved - put3(0xc1e80a); // shr eax, 10 ; p0 = cm[cxt]>>10 - put3(0xc1eb0a); // shr ebx, 10 ; p1 = cm[cxt+1]>>10 - put2(0x29c3); // sub ebx, eax, ; p1-p0 - put3(0x0fafda); // imul ebx, edx ; (p1-p0)*wt - put3(0xc1e006); // shr eax, 6 - put2(0x01d8); // add eax, ebx ; p in 0..2^28-1 - put3(0xc1e80d); // shr eax, 13 ; p in 0..32767 - put4a(0x0fbf8447, off(stretcht)); // movsx eax, word [edi+eax*2+...] - put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax - break; - - default: - error("invalid ZPAQ component"); - } - } - - // return squash(p[n-1]) - put2a(0x8b87, off(p[n-1])); // mov eax, [edi+...] - put1a(0x05, 0x800); // add eax, 2048 - put4a(0x0fbf8447, off(squasht[0])); // movsx eax, word [edi+eax*2+...] - put1(0x5f); // pop edi - put1(0x5e); // pop esi - put1(0x5d); // pop ebp - put1(0x5b); // pop ebx - put1(0xc3); // ret - - // Initialize for update() Put predictor address in edi/rdi - // and bit y=0..1 in ebp - int save_o=o; - o=5; - put1a(0xe9, save_o-10); // jmp update - o=save_o; - put1(0x53); // push ebx/rbx - put1(0x55); // push ebp/rbp - put1(0x56); // push esi/rsi - put1(0x57); // push edi/rdi - if (S==4) { - put4(0x8b7c2414); // mov edi,[esp+0x14] ; (1st arg = pr) - put4(0x8b6c2418); // mov ebp,[esp+0x18] ; (2nd arg = y) - } - else { -#if defined(unix) && !defined(__CYGWIN__) // (1st arg already in rdi) - put3(0x4889f5); // mov rbp, rsi (2nd arg in Linux-64) -#else - put3(0x4889cf); // mov rdi, rcx (1st arg in Win64) - put3(0x4889d5); // mov rbp, rdx (2nd arg) -#endif - } - - // Code update() for each component - cp=hcomp+7; - for (int i=0; i=1 && cp[0]<=9); - assert(compsize[cp[0]]>0 && compsize[cp[0]]<8); - switch (cp[0]) { - - case CONS: // c - break; - - case SSE: // sizebits j start limit - case CM: // sizebits limit - // train(cr, y); - // - // reduce 
prediction error in cr.cm - // void train(Component& cr, int y) { - // assert(y==0 || y==1); - // U32& pn=cr.cm(cr.cxt); - // U32 count=pn&0x3ff; - // int error=y*32767-(cr.cm(cr.cxt)>>17); - // pn+=(error*dt[count]&-1024)+(countrsi) - put2a(0x8bb7, offc(cm)); // mov esi,[edi+cm] ; cm - put2a(0x8b87, offc(cxt)); // mov eax,[edi+cxt] ; cxt - put1a(0x25, pr.comp[i].cm.size()-1); // and eax, size-1 - if (S==8) put1(0x48); // rex.w - put3(0x8d3486); // lea esi,[esi+eax*4] ; &cm[cxt] - put2(0x8b06); // mov eax,[esi] ; cm[cxt] - put2(0x89c2); // mov edx, eax ; cm[cxt] - put3(0xc1e811); // shr eax, 17 ; cm[cxt]>>17 - put2(0x89e9); // mov ecx, ebp ; y - put3(0xc1e10f); // shl ecx, 15 ; y*32768 - put2(0x29e9); // sub ecx, ebp ; y*32767 - put2(0x29c1); // sub ecx, eax ; error - put2a(0x81e2, 0x3ff); // and edx, 1023 ; count - put3a(0x8b8497, off(dt)); // mov eax,[edi+edx*4+dt] ; dt[count] - put3(0x0fafc8); // imul ecx, eax ; error*dt[count] - put2a(0x81e1, 0xfffffc00); // and ecx, -1024 - put2a(0x81fa, cp[2+2*(cp[0]==SSE)]*4); // cmp edx, limit*4 - put2(0x110e); // adc [esi], ecx ; pn+=... 
- break; - - case ICM: // sizebits: cxt=bh, ht[c][0..15]=bh row - // cr.ht[cr.c+(hmap4&15)]=st.next(cr.ht[cr.c+(hmap4&15)], y); - // U32& pn=cr.cm(cr.cxt); - // pn+=int(y*32767-(pn>>8))>>2; - - case ISSE: // sizebits j -- c=hi, cxt=bh - // assert(cr.cxt==cr.ht[cr.c+(hmap4&15)]); - // int err=y*32767-squash(p[i]); - // int *wt=(int*)&cr.cm[cr.cxt*2]; - // wt[0]=clamp512k(wt[0]+((err*p[cp[2]]+(1<<12))>>13)); - // wt[1]=clamp512k(wt[1]+((err+16)>>5)); - // cr.ht[cr.c+(hmap4&15)]=st.next(cr.cxt, y); - - // update bit history bh to next(bh,y=ebp) in ht[c+(hmap4&15)] - put3(0x8b4700+off(hmap4)); // mov eax, [edi+&hmap4] - put3(0x83e00f); // and eax, 15 - put2a(0x0387, offc(c)); // add eax [edi+&c] ; cxt - if (S==8) put1(0x48); // rex.w - put2a(0x8bb7, offc(ht)); // mov esi, [edi+&ht] - put4(0x0fb61406); // movzx edx, byte [esi+eax] ; bh - put4(0x8d5c9500); // lea ebx, [ebp+edx*4] ; index to st - put4a(0x0fb69c1f, off(st)); // movzx ebx,byte[edi+ebx+st]; next bh - put3(0x881c06); // mov [esi+eax], bl ; save next bh - if (S==8) put1(0x48); // rex.w - put2a(0x8bb7, offc(cm)); // mov esi, [edi+&cm] - - // ICM: update cm[cxt=edx=bit history] to reduce prediction error - // esi = &cm - if (cp[0]==ICM) { - if (S==8) put1(0x48); // rex.w - put3(0x8d3496); // lea esi, [esi+edx*4] ; &cm[bh] - put2(0x8b06); // mov eax, [esi] ; pn - put3(0xc1e808); // shr eax, 8 ; pn>>8 - put2(0x89e9); // mov ecx, ebp ; y - put3(0xc1e10f); // shl ecx, 15 - put2(0x29e9); // sub ecx, ebp ; y*32767 - put2(0x29c1); // sub ecx, eax - put3(0xc1f902); // sar ecx, 2 - put2(0x010e); // add [esi], ecx - } - - // ISSE: update weights. edx=cxt=bit history (0..255), esi=cm[512] - else { - put2a(0x8b87, off(p[i])); // mov eax, [edi+&p[i]] - put1a(0x05, 2048); // add eax, 2048 - put4a(0x0fb78447, off(squasht)); // movzx eax, word [edi+eax*2+..] 
- put2(0x89e9); // mov ecx, ebp ; y - put3(0xc1e10f); // shl ecx, 15 - put2(0x29e9); // sub ecx, ebp ; y*32767 - put2(0x29c1); // sub ecx, eax ; err - put2a(0x8b87, off(p[cp[2]]));// mov eax, [edi+&p[j]] - put3(0x0fafc1); // imul eax, ecx - put1a(0x05, (1<<12)); // add eax, 4096 - put3(0xc1f80d); // sar eax, 13 - put3(0x0304d6); // add eax, [esi+edx*8] ; wt[0] - put1a(0x3d, (1<<19)-1); // cmp eax, (1<<19)-1 - put2(0x7e05); // jle L1 - put1a(0xb8, (1<<19)-1); // mov eax, (1<<19)-1 - put1a(0x3d, 0xfff80000); // cmp eax, -1<<19 - put2(0x7d05); // jge L2 - put1a(0xb8, 0xfff80000); // mov eax, -1<<19 - put3(0x8904d6); // L2: mov [esi+edx*8], eax - put3(0x83c110); // add ecx, 16 ; err - put3(0xc1f905); // sar ecx, 5 - put4(0x034cd604); // add ecx, [esi+edx*8+4] ; wt[1] - put2a(0x81f9, (1<<19)-1); // cmp ecx, (1<<19)-1 - put2(0x7e05); // jle L3 - put1a(0xb9, (1<<19)-1); // mov ecx, (1<<19)-1 - put2a(0x81f9, 0xfff80000); // cmp ecx, -1<<19 - put2(0x7d05); // jge L4 - put1a(0xb9, 0xfff80000); // mov ecx, -1<<19 - put4(0x894cd604); // L4: mov [esi+edx*8+4], ecx - } - break; - - case MATCH: // sizebits bufbits: - // a=len, b=offset, c=bit, cm=index, cxt=bitpos - // ht=buf, limit=pos - // assert(cr.a<=255); - // assert(cr.c==0 || cr.c==1); - // assert(cr.cxt<8); - // assert(cr.cm.size()==(size_t(1)<>5; - // int w=cr.a16[cr.cxt]; - // w+=(err*(p[cp[2]]-p[cp[3]])+(1<<12))>>13; - // if (w<0) w=0; - // if (w>65535) w=65535; - // cr.a16[cr.cxt]=w; - - // set ecx=err - put2a(0x8b87, off(p[i])); // mov eax, [edi+&p[i]] - put1a(0x05, 2048); // add eax, 2048 - put4a(0x0fb78447, off(squasht));//movzx eax, word [edi+eax*2+&squasht] - put2(0x89e9); // mov ecx, ebp ; y - put3(0xc1e10f); // shl ecx, 15 - put2(0x29e9); // sub ecx, ebp ; y*32767 - put2(0x29c1); // sub ecx, eax - put2a(0x69c9, cp[4]); // imul ecx, rate - put3(0xc1f905); // sar ecx, 5 ; err - - // Update w - put2a(0x8b87, offc(cxt)); // mov eax, [edi+&cxt] - if (S==8) put1(0x48); // rex.w - put2a(0x8bb7, offc(a16)); // mov esi, 
[edi+&a16] - if (S==8) put1(0x48); // rex.w - put3(0x8d3446); // lea esi, [esi+eax*2] ; &w - put2a(0x8b87, off(p[cp[2]])); // mov eax, [edi+&p[j]] - put2a(0x2b87, off(p[cp[3]])); // sub eax, [edi+&p[k]] ; p[j]-p[k] - put3(0x0fafc1); // imul eax, ecx ; * err - put1a(0x05, 1<<12); // add eax, 4096 - put3(0xc1f80d); // sar eax, 13 - put3(0x0fb716); // movzx edx, word [esi] ; w - put2(0x01d0); // add eax, edx - put1a(0xba, 0xffff); // mov edx, 65535 - put2(0x39d0); // cmp eax, edx - put3(0x0f4fc2); // cmovg eax, edx - put2(0x31d2); // xor edx, edx - put2(0x39d0); // cmp eax, edx - put3(0x0f4cc2); // cmovl eax, edx - put3(0x668906); // mov word [esi], ax - break; - - case MIX: // sizebits j m rate mask - // cm=wt[size][m], cxt=input - // int m=cp[3]; - // assert(m>0 && m<=i); - // assert(cr.cm.size()==m*cr.c); - // assert(cr.cxt+m<=cr.cm.size()); - // int err=(y*32767-squash(p[i]))*cp[4]>>4; - // int* wt=(int*)&cr.cm[cr.cxt]; - // for (int j=0; j>13)); - - // set ecx=err - put2a(0x8b87, off(p[i])); // mov eax, [edi+&p[i]] - put1a(0x05, 2048); // add eax, 2048 - put4a(0x0fb78447, off(squasht));//movzx eax, word [edi+eax*2+&squasht] - put2(0x89e9); // mov ecx, ebp ; y - put3(0xc1e10f); // shl ecx, 15 - put2(0x29e9); // sub ecx, ebp ; y*32767 - put2(0x29c1); // sub ecx, eax - put2a(0x69c9, cp[4]); // imul ecx, rate - put3(0xc1f904); // sar ecx, 4 ; err - - // set esi=wt - put2a(0x8b87, offc(cxt)); // mov eax, [edi+&cxt] ; cxt - if (S==8) put1(0x48); // rex.w - put2a(0x8bb7, offc(cm)); // mov esi, [edi+&cm] - if (S==8) put1(0x48); // rex.w - put3(0x8d3486); // lea esi, [esi+eax*4] ; wt - - for (int k=0; kpcode_size) { - allocx(pcode, pcode_size, n); - n=assemble_p(); - } - if (!pcode || n<15 || pcode_size<15) - error("run JIT failed"); - } - assert(pcode && pcode[0]); - return ((int(*)(Predictor*))&pcode[10])(this); -#endif -} - -// Update the model with bit y = 0..1 -// Use the JIT code starting at pcode[5]. 
-void Predictor::update(int y) { -#ifdef NOJIT - update0(y); -#else - assert(pcode && pcode[5]); - ((void(*)(Predictor*, int))&pcode[5])(this, y); - - // Save bit y in c8, hmap4 (not implemented in JIT) - c8+=c8+y; - if (c8>=256) { - z.run(c8-256); - hmap4=1; - c8=1; - for (int i=0; i=16 && c8<32) - hmap4=(hmap4&0xf)<<5|y<<4|1; - else - hmap4=(hmap4&0x1f0)|(((hmap4&0xf)*2+y)&0xf); -#endif -} - -// Execute the ZPAQL code with input byte or -1 for EOF. -// Use JIT code at rcode if available, or else create it. -void ZPAQL::run(U32 input) { -#ifdef NOJIT - run0(input); -#else - if (!rcode) { - allocx(rcode, rcode_size, (hend*10+4096)&-4096); - int n=assemble(); - if (n>rcode_size) { - allocx(rcode, rcode_size, n); - n=assemble(); - } - if (!rcode || n<10 || rcode_size<10) - error("run JIT failed"); - } - a=input; - const U32 rc=((int(*)())(&rcode[0]))(); - if (rc==0) return; - else if (rc==1) libzpaq::error("Bad ZPAQL opcode"); - else if (rc==2) libzpaq::error("Out of memory"); - else if (rc==3) libzpaq::error("Write error"); - else libzpaq::error("ZPAQL execution error"); -#endif -} - -////////////////////////// divsufsort /////////////////////////////// - -/* - * divsufsort.c for libdivsufsort-lite - * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/*- Constants -*/ -#define INLINE __inline -#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1) -# undef ALPHABET_SIZE -#endif -#if !defined(ALPHABET_SIZE) -# define ALPHABET_SIZE (256) -#endif -#define BUCKET_A_SIZE (ALPHABET_SIZE) -#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE) -#if defined(SS_INSERTIONSORT_THRESHOLD) -# if SS_INSERTIONSORT_THRESHOLD < 1 -# undef SS_INSERTIONSORT_THRESHOLD -# define SS_INSERTIONSORT_THRESHOLD (1) -# endif -#else -# define SS_INSERTIONSORT_THRESHOLD (8) -#endif -#if defined(SS_BLOCKSIZE) -# if SS_BLOCKSIZE < 0 -# undef SS_BLOCKSIZE -# define SS_BLOCKSIZE (0) -# elif 32768 <= SS_BLOCKSIZE -# undef SS_BLOCKSIZE -# define SS_BLOCKSIZE (32767) -# endif -#else -# define SS_BLOCKSIZE (1024) -#endif -/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */ -#if SS_BLOCKSIZE == 0 -# define SS_MISORT_STACKSIZE (96) -#elif SS_BLOCKSIZE <= 4096 -# define SS_MISORT_STACKSIZE (16) -#else -# define SS_MISORT_STACKSIZE (24) -#endif -#define SS_SMERGE_STACKSIZE (32) -#define TR_INSERTIONSORT_THRESHOLD (8) -#define TR_STACKSIZE (64) - - -/*- Macros -*/ -#ifndef SWAP -# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0) -#endif /* SWAP */ -#ifndef MIN -# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b)) -#endif /* MIN */ -#ifndef MAX -# define MAX(_a, _b) (((_a) > (_b)) ? 
(_a) : (_b)) -#endif /* MAX */ -#define STACK_PUSH(_a, _b, _c, _d)\ - do {\ - assert(ssize < STACK_SIZE);\ - stack[ssize].a = (_a), stack[ssize].b = (_b),\ - stack[ssize].c = (_c), stack[ssize++].d = (_d);\ - } while(0) -#define STACK_PUSH5(_a, _b, _c, _d, _e)\ - do {\ - assert(ssize < STACK_SIZE);\ - stack[ssize].a = (_a), stack[ssize].b = (_b),\ - stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\ - } while(0) -#define STACK_POP(_a, _b, _c, _d)\ - do {\ - assert(0 <= ssize);\ - if(ssize == 0) { return; }\ - (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ - (_c) = stack[ssize].c, (_d) = stack[ssize].d;\ - } while(0) -#define STACK_POP5(_a, _b, _c, _d, _e)\ - do {\ - assert(0 <= ssize);\ - if(ssize == 0) { return; }\ - (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ - (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\ - } while(0) -#define BUCKET_A(_c0) bucket_A[(_c0)] -#if ALPHABET_SIZE == 256 -#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)]) -#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)]) -#else -#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)]) -#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)]) -#endif - - -/*- Private Functions -*/ - -static const int lg_table[256]= { - -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, - 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, - 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, - 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 -}; - -#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) - -static INLINE -int -ss_ilg(int n) { -#if SS_BLOCKSIZE == 0 - return (n & 
0xffff0000) ? - ((n & 0xff000000) ? - 24 + lg_table[(n >> 24) & 0xff] : - 16 + lg_table[(n >> 16) & 0xff]) : - ((n & 0x0000ff00) ? - 8 + lg_table[(n >> 8) & 0xff] : - 0 + lg_table[(n >> 0) & 0xff]); -#elif SS_BLOCKSIZE < 256 - return lg_table[n]; -#else - return (n & 0xff00) ? - 8 + lg_table[(n >> 8) & 0xff] : - 0 + lg_table[(n >> 0) & 0xff]; -#endif -} - -#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ - -#if SS_BLOCKSIZE != 0 - -static const int sqq_table[256] = { - 0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57, 59, 61, - 64, 65, 67, 69, 71, 73, 75, 76, 78, 80, 81, 83, 84, 86, 87, 89, - 90, 91, 93, 94, 96, 97, 98, 99, 101, 102, 103, 104, 106, 107, 108, 109, -110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, -128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, -143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155, -156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168, -169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180, -181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191, -192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201, -202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211, -212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221, -221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230, -230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, -239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, -247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255 -}; - -static INLINE -int -ss_isqrt(int x) { - int y, e; - - if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; } - e = (x & 0xffff0000) ? - ((x & 0xff000000) ? 
- 24 + lg_table[(x >> 24) & 0xff] : - 16 + lg_table[(x >> 16) & 0xff]) : - ((x & 0x0000ff00) ? - 8 + lg_table[(x >> 8) & 0xff] : - 0 + lg_table[(x >> 0) & 0xff]); - - if(e >= 16) { - y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7); - if(e >= 24) { y = (y + 1 + x / y) >> 1; } - y = (y + 1 + x / y) >> 1; - } else if(e >= 8) { - y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1; - } else { - return sqq_table[x] >> 4; - } - - return (x < (y * y)) ? y - 1 : y; -} - -#endif /* SS_BLOCKSIZE != 0 */ - - -/*---------------------------------------------------------------------------*/ - -/* Compares two suffixes. */ -static INLINE -int -ss_compare(const unsigned char *T, - const int *p1, const int *p2, - int depth) { - const unsigned char *U1, *U2, *U1n, *U2n; - - for(U1 = T + depth + *p1, - U2 = T + depth + *p2, - U1n = T + *(p1 + 1) + 2, - U2n = T + *(p2 + 1) + 2; - (U1 < U1n) && (U2 < U2n) && (*U1 == *U2); - ++U1, ++U2) { - } - - return U1 < U1n ? - (U2 < U2n ? *U1 - *U2 : 1) : - (U2 < U2n ? 
-1 : 0); -} - - -/*---------------------------------------------------------------------------*/ - -#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) - -/* Insertionsort for small size groups */ -static -void -ss_insertionsort(const unsigned char *T, const int *PA, - int *first, int *last, int depth) { - int *i, *j; - int t; - int r; - - for(i = last - 2; first <= i; --i) { - for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) { - do { *(j - 1) = *j; } while((++j < last) && (*j < 0)); - if(last <= j) { break; } - } - if(r == 0) { *j = ~*j; } - *(j - 1) = t; - } -} - -#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */ - - -/*---------------------------------------------------------------------------*/ - -#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) - -static INLINE -void -ss_fixdown(const unsigned char *Td, const int *PA, - int *SA, int i, int size) { - int j, k; - int v; - int c, d, e; - - for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { - d = Td[PA[SA[k = j++]]]; - if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; } - if(d <= c) { break; } - } - SA[i] = v; -} - -/* Simple top-down heapsort. */ -static -void -ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) { - int i, m; - int t; - - m = size; - if((size % 2) == 0) { - m--; - if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); } - } - - for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); } - if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); } - for(i = m - 1; 0 < i; --i) { - t = SA[0], SA[0] = SA[i]; - ss_fixdown(Td, PA, SA, 0, i); - SA[i] = t; - } -} - - -/*---------------------------------------------------------------------------*/ - -/* Returns the median of three elements. 
*/ -static INLINE -int * -ss_median3(const unsigned char *Td, const int *PA, - int *v1, int *v2, int *v3) { - int *t; - if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); } - if(Td[PA[*v2]] > Td[PA[*v3]]) { - if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; } - else { return v3; } - } - return v2; -} - -/* Returns the median of five elements. */ -static INLINE -int * -ss_median5(const unsigned char *Td, const int *PA, - int *v1, int *v2, int *v3, int *v4, int *v5) { - int *t; - if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); } - if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); } - if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); } - if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); } - if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); } - if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; } - return v3; -} - -/* Returns the pivot element. */ -static INLINE -int * -ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) { - int *middle; - int t; - - t = last - first; - middle = first + t / 2; - - if(t <= 512) { - if(t <= 32) { - return ss_median3(Td, PA, first, middle, last - 1); - } else { - t >>= 2; - return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1); - } - } - t >>= 3; - first = ss_median3(Td, PA, first, first + t, first + (t << 1)); - middle = ss_median3(Td, PA, middle - t, middle, middle + t); - last = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1); - return ss_median3(Td, PA, first, middle, last); -} - - -/*---------------------------------------------------------------------------*/ - -/* Binary partition for substrings. 
*/ -static INLINE -int * -ss_partition(const int *PA, - int *first, int *last, int depth) { - int *a, *b; - int t; - for(a = first - 1, b = last;;) { - for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; } - for(; (a < --b) && ((PA[*b] + depth) < (PA[*b + 1] + 1));) { } - if(b <= a) { break; } - t = ~*b; - *b = *a; - *a = t; - } - if(first < a) { *first = ~*first; } - return a; -} - -/* Multikey introsort for medium size groups. */ -static -void -ss_mintrosort(const unsigned char *T, const int *PA, - int *first, int *last, - int depth) { -#define STACK_SIZE SS_MISORT_STACKSIZE - struct { int *a, *b, c; int d; } stack[STACK_SIZE]; - const unsigned char *Td; - int *a, *b, *c, *d, *e, *f; - int s, t; - int ssize; - int limit; - int v, x = 0; - - for(ssize = 0, limit = ss_ilg(last - first);;) { - - if((last - first) <= SS_INSERTIONSORT_THRESHOLD) { -#if 1 < SS_INSERTIONSORT_THRESHOLD - if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); } -#endif - STACK_POP(first, last, depth, limit); - continue; - } - - Td = T + depth; - if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); } - if(limit < 0) { - for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) { - if((x = Td[PA[*a]]) != v) { - if(1 < (a - first)) { break; } - v = x; - first = a; - } - } - if(Td[PA[*first] - 1] < v) { - first = ss_partition(PA, first, a, depth); - } - if((a - first) <= (last - a)) { - if(1 < (a - first)) { - STACK_PUSH(a, last, depth, -1); - last = a, depth += 1, limit = ss_ilg(a - first); - } else { - first = a, limit = -1; - } - } else { - if(1 < (last - a)) { - STACK_PUSH(first, a, depth + 1, ss_ilg(a - first)); - first = a, limit = -1; - } else { - last = a, depth += 1, limit = ss_ilg(a - first); - } - } - continue; - } - - /* choose pivot */ - a = ss_pivot(Td, PA, first, last); - v = Td[PA[*a]]; - SWAP(*first, *a); - - /* partition */ - for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { } - if(((a = b) < last) && (x < v)) { - for(; (++b < 
last) && ((x = Td[PA[*b]]) <= v);) { - if(x == v) { SWAP(*b, *a); ++a; } - } - } - for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { } - if((b < (d = c)) && (x > v)) { - for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { - if(x == v) { SWAP(*c, *d); --d; } - } - } - for(; b < c;) { - SWAP(*b, *c); - for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) { - if(x == v) { SWAP(*b, *a); ++a; } - } - for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { - if(x == v) { SWAP(*c, *d); --d; } - } - } - - if(a <= d) { - c = b - 1; - - if((s = a - first) > (t = b - a)) { s = t; } - for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } - if((s = d - c) > (t = last - d - 1)) { s = t; } - for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } - - a = first + (b - a), c = last - (d - c); - b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth); - - if((a - first) <= (last - c)) { - if((last - c) <= (c - b)) { - STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); - STACK_PUSH(c, last, depth, limit); - last = a; - } else if((a - first) <= (c - b)) { - STACK_PUSH(c, last, depth, limit); - STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); - last = a; - } else { - STACK_PUSH(c, last, depth, limit); - STACK_PUSH(first, a, depth, limit); - first = b, last = c, depth += 1, limit = ss_ilg(c - b); - } - } else { - if((a - first) <= (c - b)) { - STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); - STACK_PUSH(first, a, depth, limit); - first = c; - } else if((last - c) <= (c - b)) { - STACK_PUSH(first, a, depth, limit); - STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); - first = c; - } else { - STACK_PUSH(first, a, depth, limit); - STACK_PUSH(c, last, depth, limit); - first = b, last = c, depth += 1, limit = ss_ilg(c - b); - } - } - } else { - limit += 1; - if(Td[PA[*first] - 1] < v) { - first = ss_partition(PA, first, last, depth); - limit = ss_ilg(last - first); - } - depth += 1; - } - } -#undef STACK_SIZE -} - -#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ - - 
-/*---------------------------------------------------------------------------*/ - -#if SS_BLOCKSIZE != 0 - -static INLINE -void -ss_blockswap(int *a, int *b, int n) { - int t; - for(; 0 < n; --n, ++a, ++b) { - t = *a, *a = *b, *b = t; - } -} - -static INLINE -void -ss_rotate(int *first, int *middle, int *last) { - int *a, *b, t; - int l, r; - l = middle - first, r = last - middle; - for(; (0 < l) && (0 < r);) { - if(l == r) { ss_blockswap(first, middle, l); break; } - if(l < r) { - a = last - 1, b = middle - 1; - t = *a; - do { - *a-- = *b, *b-- = *a; - if(b < first) { - *a = t; - last = a; - if((r -= l + 1) <= l) { break; } - a -= 1, b = middle - 1; - t = *a; - } - } while(1); - } else { - a = first, b = middle; - t = *a; - do { - *a++ = *b, *b++ = *a; - if(last <= b) { - *a = t; - first = a + 1; - if((l -= r + 1) <= r) { break; } - a += 1, b = middle; - t = *a; - } - } while(1); - } - } -} - - -/*---------------------------------------------------------------------------*/ - -static -void -ss_inplacemerge(const unsigned char *T, const int *PA, - int *first, int *middle, int *last, - int depth) { - const int *p; - int *a, *b; - int len, half; - int q, r; - int x; - - for(;;) { - if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); } - else { x = 0; p = PA + *(last - 1); } - for(a = first, len = middle - first, half = len >> 1, r = -1; - 0 < len; - len = half, half >>= 1) { - b = a + half; - q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth); - if(q < 0) { - a = b + 1; - half -= (len & 1) ^ 1; - } else { - r = q; - } - } - if(a < middle) { - if(r == 0) { *a = ~*a; } - ss_rotate(a, middle, last); - last -= middle - a; - middle = a; - if(first == middle) { break; } - } - --last; - if(x != 0) { while(*--last < 0) { } } - if(middle == last) { break; } - } -} - - -/*---------------------------------------------------------------------------*/ - -/* Merge-forward with internal buffer. 
*/ -static -void -ss_mergeforward(const unsigned char *T, const int *PA, - int *first, int *middle, int *last, - int *buf, int depth) { - int *a, *b, *c, *bufend; - int t; - int r; - - bufend = buf + (middle - first) - 1; - ss_blockswap(buf, first, middle - first); - - for(t = *(a = first), b = buf, c = middle;;) { - r = ss_compare(T, PA + *b, PA + *c, depth); - if(r < 0) { - do { - *a++ = *b; - if(bufend <= b) { *bufend = t; return; } - *b++ = *a; - } while(*b < 0); - } else if(r > 0) { - do { - *a++ = *c, *c++ = *a; - if(last <= c) { - while(b < bufend) { *a++ = *b, *b++ = *a; } - *a = *b, *b = t; - return; - } - } while(*c < 0); - } else { - *c = ~*c; - do { - *a++ = *b; - if(bufend <= b) { *bufend = t; return; } - *b++ = *a; - } while(*b < 0); - - do { - *a++ = *c, *c++ = *a; - if(last <= c) { - while(b < bufend) { *a++ = *b, *b++ = *a; } - *a = *b, *b = t; - return; - } - } while(*c < 0); - } - } -} - -/* Merge-backward with internal buffer. */ -static -void -ss_mergebackward(const unsigned char *T, const int *PA, - int *first, int *middle, int *last, - int *buf, int depth) { - const int *p1, *p2; - int *a, *b, *c, *bufend; - int t; - int r; - int x; - - bufend = buf + (last - middle) - 1; - ss_blockswap(buf, middle, last - middle); - - x = 0; - if(*bufend < 0) { p1 = PA + ~*bufend; x |= 1; } - else { p1 = PA + *bufend; } - if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; } - else { p2 = PA + *(middle - 1); } - for(t = *(a = last - 1), b = bufend, c = middle - 1;;) { - r = ss_compare(T, p1, p2, depth); - if(0 < r) { - if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; } - *a-- = *b; - if(b <= buf) { *buf = t; break; } - *b-- = *a; - if(*b < 0) { p1 = PA + ~*b; x |= 1; } - else { p1 = PA + *b; } - } else if(r < 0) { - if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; } - *a-- = *c, *c-- = *a; - if(c < first) { - while(buf < b) { *a-- = *b, *b-- = *a; } - *a = *b, *b = t; - break; - } - if(*c < 0) { p2 = PA + ~*c; x |= 2; } - 
else { p2 = PA + *c; } - } else { - if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; } - *a-- = ~*b; - if(b <= buf) { *buf = t; break; } - *b-- = *a; - if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; } - *a-- = *c, *c-- = *a; - if(c < first) { - while(buf < b) { *a-- = *b, *b-- = *a; } - *a = *b, *b = t; - break; - } - if(*b < 0) { p1 = PA + ~*b; x |= 1; } - else { p1 = PA + *b; } - if(*c < 0) { p2 = PA + ~*c; x |= 2; } - else { p2 = PA + *c; } - } - } -} - -/* D&C based merge. */ -static -void -ss_swapmerge(const unsigned char *T, const int *PA, - int *first, int *middle, int *last, - int *buf, int bufsize, int depth) { -#define STACK_SIZE SS_SMERGE_STACKSIZE -#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a))) -#define MERGE_CHECK(a, b, c)\ - do {\ - if(((c) & 1) ||\ - (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\ - *(a) = ~*(a);\ - }\ - if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\ - *(b) = ~*(b);\ - }\ - } while(0) - struct { int *a, *b, *c; int d; } stack[STACK_SIZE]; - int *l, *r, *lm, *rm; - int m, len, half; - int ssize; - int check, next; - - for(check = 0, ssize = 0;;) { - if((last - middle) <= bufsize) { - if((first < middle) && (middle < last)) { - ss_mergebackward(T, PA, first, middle, last, buf, depth); - } - MERGE_CHECK(first, last, check); - STACK_POP(first, middle, last, check); - continue; - } - - if((middle - first) <= bufsize) { - if(first < middle) { - ss_mergeforward(T, PA, first, middle, last, buf, depth); - } - MERGE_CHECK(first, last, check); - STACK_POP(first, middle, last, check); - continue; - } - - for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1; - 0 < len; - len = half, half >>= 1) { - if(ss_compare(T, PA + GETIDX(*(middle + m + half)), - PA + GETIDX(*(middle - m - half - 1)), depth) < 0) { - m += half + 1; - half -= (len & 1) ^ 1; - } - } - - if(0 < m) { - lm = middle - m, rm = middle + m; - ss_blockswap(lm, 
middle, m); - l = r = middle, next = 0; - if(rm < last) { - if(*rm < 0) { - *rm = ~*rm; - if(first < lm) { for(; *--l < 0;) { } next |= 4; } - next |= 1; - } else if(first < lm) { - for(; *r < 0; ++r) { } - next |= 2; - } - } - - if((l - first) <= (last - r)) { - STACK_PUSH(r, rm, last, (next & 3) | (check & 4)); - middle = lm, last = l, check = (check & 3) | (next & 4); - } else { - if((next & 2) && (r == middle)) { next ^= 6; } - STACK_PUSH(first, lm, l, (check & 3) | (next & 4)); - first = r, middle = rm, check = (next & 3) | (check & 4); - } - } else { - if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) { - *middle = ~*middle; - } - MERGE_CHECK(first, last, check); - STACK_POP(first, middle, last, check); - } - } -#undef STACK_SIZE -} - -#endif /* SS_BLOCKSIZE != 0 */ - - -/*---------------------------------------------------------------------------*/ - -/* Substring sort */ -static -void -sssort(const unsigned char *T, const int *PA, - int *first, int *last, - int *buf, int bufsize, - int depth, int n, int lastsuffix) { - int *a; -#if SS_BLOCKSIZE != 0 - int *b, *middle, *curbuf; - int j, k, curbufsize, limit; -#endif - int i; - - if(lastsuffix != 0) { ++first; } - -#if SS_BLOCKSIZE == 0 - ss_mintrosort(T, PA, first, last, depth); -#else - if((bufsize < SS_BLOCKSIZE) && - (bufsize < (last - first)) && - (bufsize < (limit = ss_isqrt(last - first)))) { - if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; } - buf = middle = last - limit, bufsize = limit; - } else { - middle = last, limit = 0; - } - for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) { -#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE - ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth); -#elif 1 < SS_BLOCKSIZE - ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth); -#endif - curbufsize = last - (a + SS_BLOCKSIZE); - curbuf = a + SS_BLOCKSIZE; - if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; } - for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= 
k, k <<= 1, j >>= 1) { - ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth); - } - } -#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE - ss_mintrosort(T, PA, a, middle, depth); -#elif 1 < SS_BLOCKSIZE - ss_insertionsort(T, PA, a, middle, depth); -#endif - for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) { - if(i & 1) { - ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth); - a -= k; - } - } - if(limit != 0) { -#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE - ss_mintrosort(T, PA, middle, last, depth); -#elif 1 < SS_BLOCKSIZE - ss_insertionsort(T, PA, middle, last, depth); -#endif - ss_inplacemerge(T, PA, first, middle, last, depth); - } -#endif - - if(lastsuffix != 0) { - /* Insert last type B* suffix. */ - int PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2; - for(a = first, i = *(first - 1); - (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth))); - ++a) { - *(a - 1) = *a; - } - *(a - 1) = i; - } -} - - -/*---------------------------------------------------------------------------*/ - -static INLINE -int -tr_ilg(int n) { - return (n & 0xffff0000) ? - ((n & 0xff000000) ? - 24 + lg_table[(n >> 24) & 0xff] : - 16 + lg_table[(n >> 16) & 0xff]) : - ((n & 0x0000ff00) ? - 8 + lg_table[(n >> 8) & 0xff] : - 0 + lg_table[(n >> 0) & 0xff]); -} - - -/*---------------------------------------------------------------------------*/ - -/* Simple insertionsort for small size groups. 
*/ -static -void -tr_insertionsort(const int *ISAd, int *first, int *last) { - int *a, *b; - int t, r; - - for(a = first + 1; a < last; ++a) { - for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) { - do { *(b + 1) = *b; } while((first <= --b) && (*b < 0)); - if(b < first) { break; } - } - if(r == 0) { *b = ~*b; } - *(b + 1) = t; - } -} - - -/*---------------------------------------------------------------------------*/ - -static INLINE -void -tr_fixdown(const int *ISAd, int *SA, int i, int size) { - int j, k; - int v; - int c, d, e; - - for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { - d = ISAd[SA[k = j++]]; - if(d < (e = ISAd[SA[j]])) { k = j; d = e; } - if(d <= c) { break; } - } - SA[i] = v; -} - -/* Simple top-down heapsort. */ -static -void -tr_heapsort(const int *ISAd, int *SA, int size) { - int i, m; - int t; - - m = size; - if((size % 2) == 0) { - m--; - if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); } - } - - for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); } - if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); } - for(i = m - 1; 0 < i; --i) { - t = SA[0], SA[0] = SA[i]; - tr_fixdown(ISAd, SA, 0, i); - SA[i] = t; - } -} - - -/*---------------------------------------------------------------------------*/ - -/* Returns the median of three elements. */ -static INLINE -int * -tr_median3(const int *ISAd, int *v1, int *v2, int *v3) { - int *t; - if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); } - if(ISAd[*v2] > ISAd[*v3]) { - if(ISAd[*v1] > ISAd[*v3]) { return v1; } - else { return v3; } - } - return v2; -} - -/* Returns the median of five elements. 
*/ -static INLINE -int * -tr_median5(const int *ISAd, - int *v1, int *v2, int *v3, int *v4, int *v5) { - int *t; - if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); } - if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); } - if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); } - if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); } - if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); } - if(ISAd[*v3] > ISAd[*v4]) { return v4; } - return v3; -} - -/* Returns the pivot element. */ -static INLINE -int * -tr_pivot(const int *ISAd, int *first, int *last) { - int *middle; - int t; - - t = last - first; - middle = first + t / 2; - - if(t <= 512) { - if(t <= 32) { - return tr_median3(ISAd, first, middle, last - 1); - } else { - t >>= 2; - return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1); - } - } - t >>= 3; - first = tr_median3(ISAd, first, first + t, first + (t << 1)); - middle = tr_median3(ISAd, middle - t, middle, middle + t); - last = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1); - return tr_median3(ISAd, first, middle, last); -} - - -/*---------------------------------------------------------------------------*/ - -typedef struct _trbudget_t trbudget_t; -struct _trbudget_t { - int chance; - int remain; - int incval; - int count; -}; - -static INLINE -void -trbudget_init(trbudget_t *budget, int chance, int incval) { - budget->chance = chance; - budget->remain = budget->incval = incval; -} - -static INLINE -int -trbudget_check(trbudget_t *budget, int size) { - if(size <= budget->remain) { budget->remain -= size; return 1; } - if(budget->chance == 0) { budget->count += size; return 0; } - budget->remain += budget->incval - size; - budget->chance -= 1; - return 1; -} - - -/*---------------------------------------------------------------------------*/ - -static INLINE -void -tr_partition(const int *ISAd, - int *first, int *middle, int *last, - int **pa, int **pb, int v) { - int *a, *b, *c, *d, *e, *f; - int t, s; - int x = 0; - - for(b = middle - 1; 
(++b < last) && ((x = ISAd[*b]) == v);) { } - if(((a = b) < last) && (x < v)) { - for(; (++b < last) && ((x = ISAd[*b]) <= v);) { - if(x == v) { SWAP(*b, *a); ++a; } - } - } - for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { } - if((b < (d = c)) && (x > v)) { - for(; (b < --c) && ((x = ISAd[*c]) >= v);) { - if(x == v) { SWAP(*c, *d); --d; } - } - } - for(; b < c;) { - SWAP(*b, *c); - for(; (++b < c) && ((x = ISAd[*b]) <= v);) { - if(x == v) { SWAP(*b, *a); ++a; } - } - for(; (b < --c) && ((x = ISAd[*c]) >= v);) { - if(x == v) { SWAP(*c, *d); --d; } - } - } - - if(a <= d) { - c = b - 1; - if((s = a - first) > (t = b - a)) { s = t; } - for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } - if((s = d - c) > (t = last - d - 1)) { s = t; } - for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } - first += (b - a), last -= (d - c); - } - *pa = first, *pb = last; -} - -static -void -tr_copy(int *ISA, const int *SA, - int *first, int *a, int *b, int *last, - int depth) { - /* sort suffixes of middle partition - by using sorted order of suffixes of left and right partition. 
*/ - int *c, *d, *e; - int s, v; - - v = b - SA - 1; - for(c = first, d = a - 1; c <= d; ++c) { - if((0 <= (s = *c - depth)) && (ISA[s] == v)) { - *++d = s; - ISA[s] = d - SA; - } - } - for(c = last - 1, e = d + 1, d = b; e < d; --c) { - if((0 <= (s = *c - depth)) && (ISA[s] == v)) { - *--d = s; - ISA[s] = d - SA; - } - } -} - -static -void -tr_partialcopy(int *ISA, const int *SA, - int *first, int *a, int *b, int *last, - int depth) { - int *c, *d, *e; - int s, v; - int rank, lastrank, newrank = -1; - - v = b - SA - 1; - lastrank = -1; - for(c = first, d = a - 1; c <= d; ++c) { - if((0 <= (s = *c - depth)) && (ISA[s] == v)) { - *++d = s; - rank = ISA[s + depth]; - if(lastrank != rank) { lastrank = rank; newrank = d - SA; } - ISA[s] = newrank; - } - } - - lastrank = -1; - for(e = d; first <= e; --e) { - rank = ISA[*e]; - if(lastrank != rank) { lastrank = rank; newrank = e - SA; } - if(newrank != rank) { ISA[*e] = newrank; } - } - - lastrank = -1; - for(c = last - 1, e = d + 1, d = b; e < d; --c) { - if((0 <= (s = *c - depth)) && (ISA[s] == v)) { - *--d = s; - rank = ISA[s + depth]; - if(lastrank != rank) { lastrank = rank; newrank = d - SA; } - ISA[s] = newrank; - } - } -} - -static -void -tr_introsort(int *ISA, const int *ISAd, - int *SA, int *first, int *last, - trbudget_t *budget) { -#define STACK_SIZE TR_STACKSIZE - struct { const int *a; int *b, *c; int d, e; }stack[STACK_SIZE]; - int *a, *b, *c; - int t; - int v, x = 0; - int incr = ISAd - ISA; - int limit, next; - int ssize, trlink = -1; - - for(ssize = 0, limit = tr_ilg(last - first);;) { - - if(limit < 0) { - if(limit == -1) { - /* tandem repeat partition */ - tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1); - - /* update ranks */ - if(a < last) { - for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } - } - if(b < last) { - for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } - } - - /* push */ - if(1 < (b - a)) { - STACK_PUSH5(NULL, a, b, 0, 0); - STACK_PUSH5(ISAd - incr, 
first, last, -2, trlink); - trlink = ssize - 2; - } - if((a - first) <= (last - b)) { - if(1 < (a - first)) { - STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink); - last = a, limit = tr_ilg(a - first); - } else if(1 < (last - b)) { - first = b, limit = tr_ilg(last - b); - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } else { - if(1 < (last - b)) { - STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink); - first = b, limit = tr_ilg(last - b); - } else if(1 < (a - first)) { - last = a, limit = tr_ilg(a - first); - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - } else if(limit == -2) { - /* tandem repeat copy */ - a = stack[--ssize].b, b = stack[ssize].c; - if(stack[ssize].d == 0) { - tr_copy(ISA, SA, first, a, b, last, ISAd - ISA); - } else { - if(0 <= trlink) { stack[trlink].d = -1; } - tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA); - } - STACK_POP5(ISAd, first, last, limit, trlink); - } else { - /* sorted partition */ - if(0 <= *first) { - a = first; - do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a)); - first = a; - } - if(first < last) { - a = first; do { *a = ~*a; } while(*++a < 0); - next = (ISA[*a] != ISAd[*a]) ? 
tr_ilg(a - first + 1) : -1; - if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } } - - /* push */ - if(trbudget_check(budget, a - first)) { - if((a - first) <= (last - a)) { - STACK_PUSH5(ISAd, a, last, -3, trlink); - ISAd += incr, last = a, limit = next; - } else { - if(1 < (last - a)) { - STACK_PUSH5(ISAd + incr, first, a, next, trlink); - first = a, limit = -3; - } else { - ISAd += incr, last = a, limit = next; - } - } - } else { - if(0 <= trlink) { stack[trlink].d = -1; } - if(1 < (last - a)) { - first = a, limit = -3; - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - continue; - } - - if((last - first) <= TR_INSERTIONSORT_THRESHOLD) { - tr_insertionsort(ISAd, first, last); - limit = -3; - continue; - } - - if(limit-- == 0) { - tr_heapsort(ISAd, first, last - first); - for(a = last - 1; first < a; a = b) { - for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; } - } - limit = -3; - continue; - } - - /* choose pivot */ - a = tr_pivot(ISAd, first, last); - SWAP(*first, *a); - v = ISAd[*first]; - - /* partition */ - tr_partition(ISAd, first, first + 1, last, &a, &b, v); - if((last - first) != (b - a)) { - next = (ISA[*a] != v) ? 
tr_ilg(b - a) : -1; - - /* update ranks */ - for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } - if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } } - - /* push */ - if((1 < (b - a)) && (trbudget_check(budget, b - a))) { - if((a - first) <= (last - b)) { - if((last - b) <= (b - a)) { - if(1 < (a - first)) { - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - STACK_PUSH5(ISAd, b, last, limit, trlink); - last = a; - } else if(1 < (last - b)) { - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - first = b; - } else { - ISAd += incr, first = a, last = b, limit = next; - } - } else if((a - first) <= (b - a)) { - if(1 < (a - first)) { - STACK_PUSH5(ISAd, b, last, limit, trlink); - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - last = a; - } else { - STACK_PUSH5(ISAd, b, last, limit, trlink); - ISAd += incr, first = a, last = b, limit = next; - } - } else { - STACK_PUSH5(ISAd, b, last, limit, trlink); - STACK_PUSH5(ISAd, first, a, limit, trlink); - ISAd += incr, first = a, last = b, limit = next; - } - } else { - if((a - first) <= (b - a)) { - if(1 < (last - b)) { - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - STACK_PUSH5(ISAd, first, a, limit, trlink); - first = b; - } else if(1 < (a - first)) { - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - last = a; - } else { - ISAd += incr, first = a, last = b, limit = next; - } - } else if((last - b) <= (b - a)) { - if(1 < (last - b)) { - STACK_PUSH5(ISAd, first, a, limit, trlink); - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - first = b; - } else { - STACK_PUSH5(ISAd, first, a, limit, trlink); - ISAd += incr, first = a, last = b, limit = next; - } - } else { - STACK_PUSH5(ISAd, first, a, limit, trlink); - STACK_PUSH5(ISAd, b, last, limit, trlink); - ISAd += incr, first = a, last = b, limit = next; - } - } - } else { - if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; } - if((a - first) <= (last - b)) { - if(1 < (a - first)) { - STACK_PUSH5(ISAd, b, last, limit, trlink); - last = a; - } 
else if(1 < (last - b)) { - first = b; - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } else { - if(1 < (last - b)) { - STACK_PUSH5(ISAd, first, a, limit, trlink); - first = b; - } else if(1 < (a - first)) { - last = a; - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - } - } else { - if(trbudget_check(budget, last - first)) { - limit = tr_ilg(last - first), ISAd += incr; - } else { - if(0 <= trlink) { stack[trlink].d = -1; } - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - } -#undef STACK_SIZE -} - - - -/*---------------------------------------------------------------------------*/ - -/* Tandem repeat sort */ -static -void -trsort(int *ISA, int *SA, int n, int depth) { - int *ISAd; - int *first, *last; - trbudget_t budget; - int t, skip, unsorted; - - trbudget_init(&budget, tr_ilg(n) * 2 / 3, n); -/* trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */ - for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) { - first = SA; - skip = 0; - unsorted = 0; - do { - if((t = *first) < 0) { first -= t; skip += t; } - else { - if(skip != 0) { *(first + skip) = skip; skip = 0; } - last = SA + ISA[t] + 1; - if(1 < (last - first)) { - budget.count = 0; - tr_introsort(ISA, ISAd, SA, first, last, &budget); - if(budget.count != 0) { unsorted += budget.count; } - else { skip = first - last; } - } else if((last - first) == 1) { - skip = -1; - } - first = last; - } - } while(first < (SA + n)); - if(skip != 0) { *(first + skip) = skip; } - if(unsorted == 0) { break; } - } -} - - -/*---------------------------------------------------------------------------*/ - -/* Sorts suffixes of type B*. */ -static -int -sort_typeBstar(const unsigned char *T, int *SA, - int *bucket_A, int *bucket_B, - int n) { - int *PAb, *ISAb, *buf; -#ifdef _OPENMP - int *curbuf; - int l; -#endif - int i, j, k, t, m, bufsize; - int c0, c1; -#ifdef _OPENMP - int d0, d1; - int tmp; -#endif - - /* Initialize bucket arrays. 
*/ - for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; } - for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; } - - /* Count the number of occurrences of the first one or two characters of each - type A, B and B* suffix. Moreover, store the beginning position of all - type B* suffixes into the array SA. */ - for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) { - /* type A suffix. */ - do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1)); - if(0 <= i) { - /* type B* suffix. */ - ++BUCKET_BSTAR(c0, c1); - SA[--m] = i; - /* type B suffix. */ - for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { - ++BUCKET_B(c0, c1); - } - } - } - m = n - m; -/* -note: - A type B* suffix is lexicographically smaller than a type B suffix that - begins with the same first two characters. -*/ - - /* Calculate the index of start/end point of each bucket. */ - for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) { - t = i + BUCKET_A(c0); - BUCKET_A(c0) = i + j; /* start point */ - i = t + BUCKET_B(c0, c0); - for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) { - j += BUCKET_BSTAR(c0, c1); - BUCKET_BSTAR(c0, c1) = j; /* end point */ - i += BUCKET_B(c0, c1); - } - } - - if(0 < m) { - /* Sort the type B* suffixes by their first two characters. */ - PAb = SA + n - m; ISAb = SA + m; - for(i = m - 2; 0 <= i; --i) { - t = PAb[i], c0 = T[t], c1 = T[t + 1]; - SA[--BUCKET_BSTAR(c0, c1)] = i; - } - t = PAb[m - 1], c0 = T[t], c1 = T[t + 1]; - SA[--BUCKET_BSTAR(c0, c1)] = m - 1; - - /* Sort the type B* substrings using sssort. 
*/ -#ifdef _OPENMP - tmp = omp_get_max_threads(); - buf = SA + m, bufsize = (n - (2 * m)) / tmp; - c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m; -#pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp) - { - tmp = omp_get_thread_num(); - curbuf = buf + tmp * bufsize; - k = 0; - for(;;) { - #pragma omp critical(sssort_lock) - { - if(0 < (l = j)) { - d0 = c0, d1 = c1; - do { - k = BUCKET_BSTAR(d0, d1); - if(--d1 <= d0) { - d1 = ALPHABET_SIZE - 1; - if(--d0 < 0) { break; } - } - } while(((l - k) <= 1) && (0 < (l = k))); - c0 = d0, c1 = d1, j = k; - } - } - if(l == 0) { break; } - sssort(T, PAb, SA + k, SA + l, - curbuf, bufsize, 2, n, *(SA + k) == (m - 1)); - } - } -#else - buf = SA + m, bufsize = n - (2 * m); - for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) { - for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) { - i = BUCKET_BSTAR(c0, c1); - if(1 < (j - i)) { - sssort(T, PAb, SA + i, SA + j, - buf, bufsize, 2, n, *(SA + i) == (m - 1)); - } - } - } -#endif - - /* Compute ranks of type B* substrings. */ - for(i = m - 1; 0 <= i; --i) { - if(0 <= SA[i]) { - j = i; - do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i])); - SA[i + 1] = i - j; - if(i <= 0) { break; } - } - j = i; - do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0); - ISAb[SA[i]] = j; - } - - /* Construct the inverse suffix array of type B* suffixes using trsort. */ - trsort(ISAb, SA, m, 1); - - /* Set the sorted order of tyoe B* suffixes. */ - for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) { - for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { } - if(0 <= i) { - t = i; - for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { } - SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t; - } - } - - /* Calculate the index of start/end point of each bucket. 
*/ - BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */ - for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) { - i = BUCKET_A(c0 + 1) - 1; - for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) { - t = i - BUCKET_B(c0, c1); - BUCKET_B(c0, c1) = i; /* end point */ - - /* Move all type B* suffixes to the correct position. */ - for(i = t, j = BUCKET_BSTAR(c0, c1); - j <= k; - --i, --k) { SA[i] = SA[k]; } - } - BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */ - BUCKET_B(c0, c0) = i; /* end point */ - } - } - - return m; -} - -/* Constructs the suffix array by using the sorted order of type B* suffixes. */ -static -void -construct_SA(const unsigned char *T, int *SA, - int *bucket_A, int *bucket_B, - int n, int m) { - int *i, *j, *k; - int s; - int c0, c1, c2; - - if(0 < m) { - /* Construct the sorted order of type B suffixes by using - the sorted order of type B* suffixes. */ - for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { - /* Scan the suffix array from right to left. */ - for(i = SA + BUCKET_BSTAR(c1, c1 + 1), - j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; - i <= j; - --j) { - if(0 < (s = *j)) { - assert(T[s] == c1); - assert(((s + 1) < n) && (T[s] <= T[s + 1])); - assert(T[s - 1] <= T[s]); - *j = ~s; - c0 = T[--s]; - if((0 < s) && (T[s - 1] > c0)) { s = ~s; } - if(c0 != c2) { - if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } - k = SA + BUCKET_B(c2 = c0, c1); - } - assert(k < j); - *k-- = s; - } else { - assert(((s == 0) && (T[s] == c1)) || (s < 0)); - *j = ~s; - } - } - } - } - - /* Construct the suffix array by using - the sorted order of type B suffixes. */ - k = SA + BUCKET_A(c2 = T[n - 1]); - *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1); - /* Scan the suffix array from left to right. 
*/ - for(i = SA, j = SA + n; i < j; ++i) { - if(0 < (s = *i)) { - assert(T[s - 1] >= T[s]); - c0 = T[--s]; - if((s == 0) || (T[s - 1] < c0)) { s = ~s; } - if(c0 != c2) { - BUCKET_A(c2) = k - SA; - k = SA + BUCKET_A(c2 = c0); - } - assert(i < k); - *k++ = s; - } else { - assert(s < 0); - *i = ~s; - } - } -} - -/* Constructs the burrows-wheeler transformed string directly - by using the sorted order of type B* suffixes. */ -static -int -construct_BWT(const unsigned char *T, int *SA, - int *bucket_A, int *bucket_B, - int n, int m) { - int *i, *j, *k, *orig; - int s; - int c0, c1, c2; - - if(0 < m) { - /* Construct the sorted order of type B suffixes by using - the sorted order of type B* suffixes. */ - for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { - /* Scan the suffix array from right to left. */ - for(i = SA + BUCKET_BSTAR(c1, c1 + 1), - j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; - i <= j; - --j) { - if(0 < (s = *j)) { - assert(T[s] == c1); - assert(((s + 1) < n) && (T[s] <= T[s + 1])); - assert(T[s - 1] <= T[s]); - c0 = T[--s]; - *j = ~((int)c0); - if((0 < s) && (T[s - 1] > c0)) { s = ~s; } - if(c0 != c2) { - if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } - k = SA + BUCKET_B(c2 = c0, c1); - } - assert(k < j); - *k-- = s; - } else if(s != 0) { - *j = ~s; -#ifndef NDEBUG - } else { - assert(T[s] == c1); -#endif - } - } - } - } - - /* Construct the BWTed string by using - the sorted order of type B suffixes. */ - k = SA + BUCKET_A(c2 = T[n - 1]); - *k++ = (T[n - 2] < c2) ? ~((int)T[n - 2]) : (n - 1); - /* Scan the suffix array from left to right. 
*/ - for(i = SA, j = SA + n, orig = SA; i < j; ++i) { - if(0 < (s = *i)) { - assert(T[s - 1] >= T[s]); - c0 = T[--s]; - *i = c0; - if((0 < s) && (T[s - 1] < c0)) { s = ~((int)T[s - 1]); } - if(c0 != c2) { - BUCKET_A(c2) = k - SA; - k = SA + BUCKET_A(c2 = c0); - } - assert(i < k); - *k++ = s; - } else if(s != 0) { - *i = ~s; - } else { - orig = i; - } - } - - return orig - SA; -} - - -/*---------------------------------------------------------------------------*/ - -/*- Function -*/ - -int -divsufsort(const unsigned char *T, int *SA, int n) { - int *bucket_A, *bucket_B; - int m; - int err = 0; - - /* Check arguments. */ - if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; } - else if(n == 0) { return 0; } - else if(n == 1) { SA[0] = 0; return 0; } - else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; } - - bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int)); - bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int)); - - /* Suffixsort. */ - if((bucket_A != NULL) && (bucket_B != NULL)) { - m = sort_typeBstar(T, SA, bucket_A, bucket_B, n); - construct_SA(T, SA, bucket_A, bucket_B, n, m); - } else { - err = -2; - } - - free(bucket_B); - free(bucket_A); - - return err; -} - -int -divbwt(const unsigned char *T, unsigned char *U, int *A, int n) { - int *B; - int *bucket_A, *bucket_B; - int m, pidx, i; - - /* Check arguments. */ - if((T == NULL) || (U == NULL) || (n < 0)) { return -1; } - else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } - - if((B = A) == NULL) { B = (int *)malloc((size_t)(n + 1) * sizeof(int)); } - bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int)); - bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int)); - - /* Burrows-Wheeler Transform. */ - if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) { - m = sort_typeBstar(T, B, bucket_A, bucket_B, n); - pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m); - - /* Copy to output string. 
*/ - U[0] = T[n - 1]; - for(i = 0; i < pidx; ++i) { U[i + 1] = (unsigned char)B[i]; } - for(i += 1; i < n; ++i) { U[i] = (unsigned char)B[i]; } - pidx += 1; - } else { - pidx = -2; - } - - free(bucket_B); - free(bucket_A); - if(A == NULL) { free(B); } - - return pidx; -} - -// End divsufsort.c - -/////////////////////////////// add /////////////////////////////////// - -// Convert non-negative decimal number x to string of at least n digits -std::string itos(int64_t x, int n=1) { - assert(x>=0); - assert(n>=0); - std::string r; - for (; x || n>0; x/=10, --n) r=std::string(1, '0'+x%10)+r; - return r; -} - -// E8E9 transform of buf[0..n-1] to improve compression of .exe and .dll. -// Patterns (E8|E9 xx xx xx 00|FF) at offset i replace the 3 middle -// bytes with x+i mod 2^24, LSB first, reading backward. -void e8e9(unsigned char* buf, int n) { - for (int i=n-5; i>=0; --i) { - if (((buf[i]&254)==0xe8) && ((buf[i+4]+1)&254)==0) { - unsigned a=(buf[i+1]|buf[i+2]<<8|buf[i+3]<<16)+i; - buf[i+1]=a; - buf[i+2]=a>>8; - buf[i+3]=a>>16; - } - } -} - -// Encode inbuf to buf using LZ77. args are as follows: -// args[0] is log2 buffer size in MB. -// args[1] is level (1=var. length, 2=byte aligned lz77, 3=bwt) + 4 if E8E9. -// args[2] is the lz77 minimum match length and context order. -// args[3] is the lz77 higher context order to search first, or else 0. -// args[4] is the log2 hash bucket size (number of searches). -// args[5] is the log2 hash table size. If 21+args[0] then use a suffix array. -// args[6] is the secondary context look ahead -// sap is pointer to external suffix array of inbuf or 0. If supplied and -// args[0]=5..7 then it is assumed that E8E9 was already applied to -// both the input and sap and the input buffer is not modified. 
- -class LZBuffer: public libzpaq::Reader { - libzpaq::Array ht;// hash table, confirm in low bits, or SA+ISA - const unsigned char* in; // input pointer - const int checkbits; // hash confirmation size or lg(ISA size) - const int level; // 1=var length LZ77, 2=byte aligned LZ77, 3=BWT - const unsigned htsize; // size of hash table - const unsigned n; // input length - unsigned i; // current location in in (0 <= i < n) - const unsigned minMatch; // minimum match length - const unsigned minMatch2; // second context order or 0 if not used - const unsigned maxMatch; // longest match length allowed - const unsigned maxLiteral; // longest literal length allowed - const unsigned lookahead; // second context look ahead - unsigned h1, h2; // low, high order context hashes of in[i..] - const unsigned bucket; // number of matches to search per hash - 1 - const unsigned shift1, shift2; // how far to shift h1, h2 per hash - const int minMatchBoth; // max(minMatch, minMatch2) - const unsigned rb; // number of level 1 r bits in match code - unsigned bits; // pending output bits (level 1) - unsigned nbits; // number of bits in bits - unsigned rpos, wpos; // read, write pointers - unsigned idx; // BWT index - const unsigned* sa; // suffix array for BWT or LZ77-SA - unsigned* isa; // inverse suffix array for LZ77-SA - enum {BUFSIZE=1<<14}; // output buffer size - unsigned char buf[BUFSIZE]; // output buffer - - void write_literal(unsigned i, unsigned& lit); - void write_match(unsigned len, unsigned off); - void fill(); // encode to buf - - // write k bits of x - void putb(unsigned x, int k) { - x&=(1<7) { - assert(wpos>=8, nbits-=8; - } - } - - // write last byte - void flush() { - assert(wpos0) buf[wpos++]=bits; - bits=nbits=0; - } - - // write 1 byte - void put(int c) { - assert(wpos 00) = match 4*n+ll at offset (q<=65536) r=16, x>>=16; - if (x>=256) r+=8, x>>=8; - if (x>=16) r+=4, x>>=4; - assert(x>=0 && x<16); - return - 
"\x00\x01\x02\x02\x03\x03\x03\x03\x04\x04\x04\x04\x04\x04\x04\x04"[x]+r; -} - -// return number of 1 bits in x -int nbits(unsigned x) { - int r; - for (r=0; x; x>>=1) r+=x&1; - return r; -} - -// Read n bytes of compressed output into p and return number of -// bytes read in 0..n. 0 signals EOF (overrides Reader). -int LZBuffer::read(char* p, int n) { - if (rpos==wpos) fill(); - int nr=n; - if (nr>int(wpos-rpos)) nr=wpos-rpos; - if (nr) memcpy(p, buf+rpos, nr); - rpos+=nr; - assert(rpos<=wpos); - if (rpos==wpos) rpos=wpos=0; - return nr; -} - -LZBuffer::LZBuffer(StringBuffer& inbuf, int args[], const unsigned* sap): - ht((args[1]&3)==3 ? (inbuf.size()+1)*!sap // for BWT suffix array - : args[5]-args[0]<21 ? 1u<0 ? (args[5]-1)/minMatch+1 : 1), - shift2(minMatch2>0 ? (args[5]-1)/minMatch2+1 : 0), - minMatchBoth(MAX(minMatch, minMatch2+lookahead)+4), - rb(args[0]>4 ? args[0]-4 : 0), - bits(0), nbits(0), rpos(0), wpos(0), - idx(0), sa(0), isa(0) { - assert(args[0]>=0); - assert(n<=(1u<<20<=1 && args[1]<=7 && args[1]!=4); - assert(level>=1 && level<=3); - if ((minMatch<4 && level==1) || (minMatch<1 && level==2)) - error("match length $3 too small"); - - // e8e9 transform - if (args[1]>4 && !sap) e8e9(inbuf.data(), n); - - // build suffix array if not supplied - if (args[5]-args[0]>=21 || level==3) { // LZ77-SA or BWT - if (sap) - sa=sap; - else { - assert(ht.size()>=n); - assert(ht.size()>0); - sa=&ht[0]; - if (n>0) divsufsort((const unsigned char*)in, (int*)sa, n); - } - if (level<3) { - assert(ht.size()>=(n*(sap==0))+(1u<<17<0 ? 
in[n-1] : 255); - else if (i>n) put(idx&255), idx>>=8; - else if (sa[i-1]==0) idx=i, put(255); - else put(in[sa[i-1]-1]); - } - return; - } - - // LZ77: scan the input - unsigned lit=0; // number of output literals pending - const unsigned mask=(1<0 && in[p+l1-1]==in[i+l1-1]; --l1); - int score=int(l-l1)*8-lg(i-p)-4*(lit==0 && l1>0)-11; - for (unsigned a=0; abscore) blen=l, bp=p, blit=l1, bscore=score; - if (l255) break; - } - } - } - if (bscore<=0 || blen0) { - for (unsigned k=0; k<=bucket; ++k) { - unsigned p=ht[h2^k]; - if (p && (p&mask)==(in[i+3]&mask)) { - p>>=checkbits; - if (p=minMatch2+lookahead) { - int l1; // length back from lookahead - for (l1=lookahead; l1>0 && in[p+l1-1]==in[i+l1-1]; --l1); - assert(l1>=0 && l1<=int(lookahead)); - int score=int(l-l1)*8-lg(i-p)-8*(lit==0 && l1>0)-11; - if (score>bscore) blen=l, bp=p, blit=l1, bscore=score; - } - } - } - if (blen>=128) break; - } - } - - // Search the lower order context - if (!minMatch2 || blen>=checkbits; - if (p0)-11; - if (score>bscore) blen=l, bp=p, blit=0, bscore=score; - } - } - if (blen>=128) break; - } - } - } - - // If match is long enough, then output any pending literals first, - // and then the match. blen is the length of the match. 
- assert(i>=bp); - const unsigned off=i-bp; // offset - if (off>0 && bscore>0 - && blen-blit>=minMatch+(level==2)*((off>=(1<<16))+(off>=(1<<24)))) { - lit+=blit; - write_literal(i+blit, lit); - write_match(blen-blit, off); - } - - // Otherwise add to literal length - else { - blen=1; - ++lit; - } - - // Update index, advance blen bytes - if (isa) - i+=blen; - else { - while (blen--) { - if (i+minMatchBoth>19)&bucket; - const unsigned p=(i<=maxLiteral) - write_literal(i, lit); - } - - // Write pending literals at end of input - assert(i<=n); - if (i==n) { - write_literal(n, lit); - flush(); - } -} - -// Write literal sequence in[i-lit..i-1], set lit=0 -void LZBuffer::write_literal(unsigned i, unsigned& lit) { - assert(lit>=0); - assert(i>=0 && i<=n); - assert(i>=lit); - if (level==1) { - if (lit<1) return; - int ll=lg(lit); - assert(ll>=1 && ll<=24); - putb(0, 2); - --ll; - while (--ll>=0) { - putb(1, 1); - putb((lit>>ll)&1, 1); - } - putb(0, 1); - while (lit) putb(in[i-lit--], 8); - } - else { - assert(level==2); - while (lit>0) { - unsigned lit1=lit; - if (lit1>64) lit1=64; - put(lit1-1); - for (unsigned j=i-lit; j=minMatch && len<=maxMatch); - assert(off>0); - assert(len>=4); - assert(rb>=0 && rb<=8); - int ll=lg(len)-1; - assert(ll>=2); - off+=(1<=0 && lo<=23); - putb((lo+8)>>3, 2);// mm - putb(lo&7, 3); // mmm - while (--ll>=2) { // n - putb(1, 1); - putb((len>>ll)&1, 1); - } - putb(0, 1); - putb(len&3, 2); // ll - putb(off, rb); // r - putb(off>>rb, lo); // q - } - - // x[2]:len[6] off[x-1] - else { - assert(level==2); - assert(minMatch>=1 && minMatch<=64); - --off; - while (len>0) { // Split long matches to len1=minMatch..minMatch+63 - const unsigned len1=len>minMatch*2+63 ? minMatch+63 : - len>minMatch+63 ? 
len-minMatch : len; - assert(wpos=minMatch && len1>8); - put(off); - } - else if (off<(1<<24)) { - put(128+len1-minMatch); - put(off>>16); - put(off>>8); - put(off); - } - else { - put(192+len1-minMatch); - put(off>>24); - put(off>>16); - put(off>>8); - put(off); - } - len-=len1; - } - } -} - -// Generate a config file from the method argument with syntax: -// {0|x|s|i}[N1[,N2]...][{ciamtswf}[N1[,N2]]...]... -std::string makeConfig(const char* method, int args[]) { - assert(method); - const char type=method[0]; - assert(type=='x' || type=='s' || type=='0' || type=='i'); - - // Read "{x|s|i|0}N1,N2...N9" into args[0..8] ($1..$9) - args[0]=0; // log block size in MiB - args[1]=0; // 0=none, 1=var-LZ77, 2=byte-LZ77, 3=BWT, 4..7 adds E8E9 - args[2]=0; // lz77 minimum match length - args[3]=0; // secondary context length - args[4]=0; // log searches - args[5]=0; // lz77 hash table size or SA if args[0]+21 - args[6]=0; // secondary context look ahead - args[7]=0; // not used - args[8]=0; // not used - if (isdigit(*++method)) args[0]=0; - for (int i=0; i<9 && (isdigit(*method) || *method==',' || *method=='.');) { - if (isdigit(*method)) - args[i]=args[i]*10+*method-'0'; - else if (++i<9) - args[i]=0; - ++method; - } - - // "0..." = No compression - if (type=='0') - return "comp 0 0 0 0 0 hcomp end\n"; - - // Generate the postprocessor - std::string hdr, pcomp; - const int level=args[1]&3; - const bool doe8=args[1]>=4 && args[1]<=7; - - // LZ77+Huffman, with or without E8E9 - if (level==1) { - const int rb=args[0]>4 ? 
args[0]-4 : 0; - hdr="comp 9 16 0 $1+20 "; - pcomp= - "pcomp lazy2 3 ;\n" - " (r1 = state\n" - " r2 = len - match or literal length\n" - " r3 = m - number of offset bits expected\n" - " r4 = ptr to buf\n" - " r5 = r - low bits of offset\n" - " c = bits - input buffer\n" - " d = n - number of bits in c)\n" - "\n" - " a> 255 if\n"; - if (doe8) - pcomp+= - " b=0 d=r 4 do (for b=0..d-1, d = end of buf)\n" - " a=b a==d ifnot\n" - " a+= 4 a>= 8 b++\n" - " *b=a a>>= 8 b++\n" - " *b=a b++\n" - " endif\n" - " b=c\n" - " endif\n" - " endif\n" - " a=*b out b++\n" - " forever\n" - " endif\n" - "\n"; - pcomp+= - " (reset state)\n" - " a=0 b=0 c=0 d=0 r=a 1 r=a 2 r=a 3 r=a 4\n" - " halt\n" - " endif\n" - "\n" - " a<<=d a+=c c=a (bits+=a< 0 if (if (bits&3))\n" - " a-- a<<= 3 r=a 3 (m=((bits&3)-1)*8)\n" - " a=c a>>= 2 c=a (bits>>=2)\n" - " b=r 3 a&= 7 a+=b r=a 3 (m+=bits&7)\n" - " a=c a>>= 3 c=a (bits>>=3)\n" - " a=d a-= 5 d=a (n-=5)\n" - " a= 1 r=a 1 (state=1)\n" - " else (literal, discard 00)\n" - " a=c a>>= 2 c=a (bits>>=2)\n" - " d-- d-- (n-=2)\n" - " a= 3 r=a 1 (state=3)\n" - " endif\n" - " endif\n" - "\n" - " (while state==1 && n>=3 (expect match length n*4+ll -> r2))\n" - " do a=r 1 a== 1 if a=d a> 2 if\n" - " a=c a&= 1 a== 1 if (if bits&1)\n" - " a=c a>>= 1 c=a (bits>>=1)\n" - " b=r 2 a=c a&= 1 a+=b a+=b r=a 2 (len+=len+(bits&1))\n" - " a=c a>>= 1 c=a (bits>>=1)\n" - " d-- d-- (n-=2)\n" - " else\n" - " a=c a>>= 1 c=a (bits>>=1)\n" - " a=r 2 a<<= 2 b=a (len<<=2)\n" - " a=c a&= 3 a+=b r=a 2 (len+=bits&3)\n" - " a=c a>>= 2 c=a (bits>>=2)\n" - " d-- d-- d-- (n-=3)\n"; - if (rb) - pcomp+=" a= 5 r=a 1 (state=5)\n"; - else - pcomp+=" a= 2 r=a 1 (state=2)\n"; - pcomp+= - " endif\n" - " forever endif endif\n" - "\n"; - if (rb) pcomp+= // save r in r5 - " (if state==5 && n>=8) (expect low bits of offset to put in r5)\n" - " a=r 1 a== 5 if a=d a> "+itos(rb-1)+" if\n" - " a=c a&= "+itos((1<>= "+itos(rb)+" c=a\n" - " a=d a-= "+itos(rb)+ " d=a\n" - " a= 2 r=a 1 (go to state 2)\n" - " 
endif endif\n" - "\n"; - pcomp+= - " (if state==2 && n>=m) (expect m offset bits)\n" - " a=r 1 a== 2 if a=r 3 a>d ifnot\n" - " a=c r=a 6 a=d r=a 7 (save c=bits, d=n in r6,r7)\n" - " b=r 3 a= 1 a<<=b d=a (d=1< 0 if d--\n" - " a=*c *b=a c++ b++ (buf[ptr++]-buf[p++])\n"; - if (!doe8) pcomp+=" out\n"; - pcomp+= - " forever endif\n" - " a=b r=a 4\n" - "\n" - " a=r 6 b=r 3 a>>=b c=a (bits>>=m)\n" - " a=r 7 a-=b d=a (n-=m)\n" - " a=0 r=a 1 (state=0)\n" - " endif endif\n" - "\n" - " (while state==3 && n>=2 (expect literal length))\n" - " do a=r 1 a== 3 if a=d a> 1 if\n" - " a=c a&= 1 a== 1 if (if bits&1)\n" - " a=c a>>= 1 c=a (bits>>=1)\n" - " b=r 2 a&= 1 a+=b a+=b r=a 2 (len+=len+(bits&1))\n" - " a=c a>>= 1 c=a (bits>>=1)\n" - " d-- d-- (n-=2)\n" - " else\n" - " a=c a>>= 1 c=a (bits>>=1)\n" - " d-- (--n)\n" - " a= 4 r=a 1 (state=4)\n" - " endif\n" - " forever endif endif\n" - "\n" - " (if state==4 && n>=8 (expect len literals))\n" - " a=r 1 a== 4 if a=d a> 7 if\n" - " b=r 4 a=c *b=a\n"; - if (!doe8) pcomp+=" out\n"; - pcomp+= - " b++ a=b r=a 4 (buf[ptr++]=bits)\n" - " a=c a>>= 8 c=a (bits>>=8)\n" - " a=d a-= 8 d=a (n-=8)\n" - " a=r 2 a-- r=a 2 a== 0 if (if --len<1)\n" - " a=0 r=a 1 (state=0)\n" - " endif\n" - " endif endif\n" - " halt\n" - "end\n"; - } - - // Byte aligned LZ77, with or without E8E9 - else if (level==2) { - hdr="comp 9 16 0 $1+20 "; - pcomp= - "pcomp lzpre c ;\n" - " (Decode LZ77: d=state, M=output buffer, b=size)\n" - " a> 255 if (at EOF decode e8e9 and output)\n"; - if (doe8) - pcomp+= - " d=b b=0 do (for b=0..d-1, d = end of buf)\n" - " a=b a==d ifnot\n" - " a+= 4 a>= 8 b++\n" - " *b=a a>>= 8 b++\n" - " *b=a b++\n" - " endif\n" - " b=c\n" - " endif\n" - " endif\n" - " a=*b out b++\n" - " forever\n" - " endif\n"; - pcomp+= - " b=0 c=0 d=0 a=0 r=a 1 r=a 2 (reset state)\n" - " halt\n" - " endif\n" - "\n" - " (in state d==0, expect a new code)\n" - " (put length in r1 and inital part of offset in r2)\n" - " c=a a=d a== 0 if\n" - " a=c a>>= 6 a++ d=a\n" - " 
a== 1 if (literal?)\n" - " a+=c r=a 1 a=0 r=a 2\n" - " else (3 to 5 byte match)\n" - " d++ a=c a&= 63 a+= $3 r=a 1 a=0 r=a 2\n" - " endif\n" - " else\n" - " a== 1 if (writing literal)\n" - " a=c *b=a b++\n"; - if (!doe8) pcomp+=" out\n"; - pcomp+= - " a=r 1 a-- a== 0 if d=0 endif r=a 1 (if (--len==0) state=0)\n" - " else\n" - " a> 2 if (reading offset)\n" - " a=r 2 a<<= 8 a|=c r=a 2 d-- (off=off<<8|c, --state)\n" - " else (state==2, write match)\n" - " a=r 2 a<<= 8 a|=c c=a a=b a-=c a-- c=a (c=i-off-1)\n" - " d=r 1 (d=len)\n" - " do (copy and output d=len bytes)\n" - " a=*c *b=a c++ b++\n"; - if (!doe8) pcomp+=" out\n"; - pcomp+= - " d-- a=d a> 0 while\n" - " (d=state=0. off, len don\'t matter)\n" - " endif\n" - " endif\n" - " endif\n" - " halt\n" - "end\n"; - } - - // BWT with or without E8E9 - else if (level==3) { // IBWT - hdr="comp 9 16 $1+20 $1+20 "; // 2^$1 = block size in MB - pcomp= - "pcomp bwtrle c ;\n" - "\n" - " (read BWT, index into M, size in b)\n" - " a> 255 ifnot\n" - " *b=a b++\n" - "\n" - " (inverse BWT)\n" - " elsel\n" - "\n" - " (index in last 4 bytes, put in c and R1)\n" - " b-- a=*b\n" - " b-- a<<= 8 a+=*b\n" - " b-- a<<= 8 a+=*b\n" - " b-- a<<= 8 a+=*b c=a r=a 1\n" - "\n" - " (save size in R2)\n" - " a=b r=a 2\n" - "\n" - " (count bytes in H[~1..~255, ~0])\n" - " do\n" - " a=b a> 0 if\n" - " b-- a=*b a++ a&= 255 d=a d! *d++\n" - " forever\n" - " endif\n" - "\n" - " (cumulative counts: H[~i=0..255] = count of bytes before i)\n" - " d=0 d! *d= 1 a=0\n" - " do\n" - " a+=*d *d=a d--\n" - " d<>a a! a> 255 a! d<>a until\n" - "\n" - " (build first part of linked list in H[0..idx-1])\n" - " b=0 do\n" - " a=c a>b if\n" - " d=*b d! *d++ d=*d d-- *d=b\n" - " b++ forever\n" - " endif\n" - "\n" - " (rest of list in H[idx+1..n-1])\n" - " b=c b++ c=r 2 do\n" - " a=c a>b if\n" - " d=*b d! 
*d++ d=*d d-- *d=b\n" - " b++ forever\n" - " endif\n" - "\n"; - if (args[0]<=4) { // faster IBWT list traversal limited to 16 MB blocks - pcomp+= - " (copy M to low 8 bits of H to reduce cache misses in next loop)\n" - " b=0 do\n" - " a=c a>b if\n" - " d=b a=*d a<<= 8 a+=*b *d=a\n" - " b++ forever\n" - " endif\n" - "\n" - " (traverse list and output or copy to M)\n" - " d=r 1 b=0 do\n" - " a=d a== 0 ifnot\n" - " a=*d a>>= 8 d=a\n"; - if (doe8) pcomp+=" *b=*d b++\n"; - else pcomp+=" a=*d out\n"; - pcomp+= - " forever\n" - " endif\n" - "\n"; - if (doe8) // IBWT+E8E9 - pcomp+= - " (e8e9 transform to out)\n" - " d=b b=0 do (for b=0..d-1, d = end of buf)\n" - " a=b a==d ifnot\n" - " a+= 4 a>= 8 b++\n" - " *b=a a>>= 8 b++\n" - " *b=a b++\n" - " endif\n" - " b=c\n" - " endif\n" - " endif\n" - " a=*b out b++\n" - " forever\n" - " endif\n"; - pcomp+= - " endif\n" - " halt\n" - "end\n"; - } - else { // slower IBWT list traversal for all sized blocks - if (doe8) { // E8E9 after IBWT - pcomp+= - " (R2 = output size without EOS)\n" - " a=r 2 a-- r=a 2\n" - "\n" - " (traverse list (d = IBWT pointer) and output inverse e8e9)\n" - " (C = offset = 0..R2-1)\n" - " (R4 = last 4 bytes shifted in from MSB end)\n" - " (R5 = temp pending output byte)\n" - " c=0 d=r 1 do\n" - " a=d a== 0 ifnot\n" - " d=*d\n" - "\n" - " (store byte in R4 and shift out to R5)\n" - " b=d a=*b a<<= 24 b=a\n" - " a=r 4 r=a 5 a>>= 8 a|=b r=a 4\n" - "\n" - " (if E8|E9 xx xx xx 00|FF in R4:R5 then subtract c from x)\n" - " a=c a> 3 if\n" - " a=r 5 a&= 254 a== 232 if\n" - " a=r 4 a>>= 24 b=a a++ a&= 254 a< 2 if\n" - " a=r 4 a-=c a+= 4 a<<= 8 a>>= 8 \n" - " b<>a a<<= 24 a+=b r=a 4\n" - " endif\n" - " endif\n" - " endif\n" - "\n" - " (output buffered byte)\n" - " a=c a> 3 if a=r 5 out endif c++\n" - "\n" - " forever\n" - " endif\n" - "\n" - " (output up to 4 pending bytes in R4)\n" - " b=r 4\n" - " a=c a> 3 a=b if out endif a>>= 8 b=a\n" - " a=c a> 2 a=b if out endif a>>= 8 b=a\n" - " a=c a> 1 a=b if out endif a>>= 
8 b=a\n" - " a=c a> 0 a=b if out endif\n" - "\n" - " endif\n" - " halt\n" - "end\n"; - } - else { - pcomp+= - " (traverse list and output)\n" - " d=r 1 do\n" - " a=d a== 0 ifnot\n" - " d=*d\n" - " b=d a=*b out\n" - " forever\n" - " endif\n" - " endif\n" - " halt\n" - "end\n"; - } - } - } - - // E8E9 or no preprocessing - else if (level==0) { - hdr="comp 9 16 0 0 "; - if (doe8) { // E8E9? - pcomp= - "pcomp e8e9 d ;\n" - " a> 255 if\n" - " a=c a> 4 if\n" - " c= 4\n" - " else\n" - " a! a+= 5 a<<= 3 d=a a=b a>>=d b=a\n" - " endif\n" - " do a=c a> 0 if\n" - " a=b out a>>= 8 b=a c--\n" - " forever endif\n" - " else\n" - " *b=b a<<= 24 d=a a=b a>>= 8 a+=d b=a c++\n" - " a=c a> 4 if\n" - " a=*b out\n" - " a&= 254 a== 232 if\n" - " a=b a>>= 24 a++ a&= 254 a== 0 if\n" - " a=b a>>= 24 a<<= 24 d=a\n" - " a=b a-=c a+= 5\n" - " a<<= 8 a>>= 8 a|=d b=a\n" - " endif\n" - " endif\n" - " endif\n" - " endif\n" - " halt\n" - "end\n"; - } - else - pcomp="end\n"; - } - else - error("Unsupported method"); - - // Build context model (comp, hcomp) assuming: - // H[0..254] = contexts - // H[255..511] = location of last byte i-255 - // M = last 64K bytes, filling backward - // C = pointer to most recent byte - // R1 = level 2 lz77 1+bytes expected until next code, 0=init - // R2 = level 2 lz77 first byte of code - int ncomp=0; // number of components - const int membits=args[0]+20; - int sb=5; // bits in last context - std::string comp; - std::string hcomp="hcomp\n" - "c-- *c=a a+= 255 d=a *d=c\n"; - if (level==2) { // put level 2 lz77 parse state in R1, R2 - hcomp+= - " (decode lz77 into M. 
Codes:\n" - " 00xxxxxx = literal length xxxxxx+1\n" - " xx......, xx > 0 = match with xx offset bytes to follow)\n" - "\n" - " a=r 1 a== 0 if (init)\n" - " a= "+itos(111+57*doe8)+" (skip post code)\n" - " else a== 1 if (new code?)\n" - " a=*c r=a 2 (save code in R2)\n" - " a> 63 if a>>= 6 a++ a++ (match)\n" - " else a++ a++ endif (literal)\n" - " else (read rest of code)\n" - " a--\n" - " endif endif\n" - " r=a 1 (R1 = 1+expected bytes to next code)\n"; - } - - // Generate the context model - while (*method && ncomp<254) { - - // parse command C[N1[,N2]...] into v = {C, N1, N2...} - std::vector v; - v.push_back(*method++); - if (isdigit(*method)) { - v.push_back(*method++-'0'); - while (isdigit(*method) || *method==',' || *method=='.') { - if (isdigit(*method)) - v.back()=v.back()*10+*method++-'0'; - else { - v.push_back(0); - ++method; - } - } - } - - // c: context model - // N1%1000: 0=ICM 1..256=CM limit N1-1 - // N1/1000: number of times to halve memory - // N2: 1..255=offset mod N2. 1000..1255=distance to N2-1000 - // N3...: 0..255=byte mask + 256=lz77 state. 1000+=run of N3-1000 zeros. 
- if (v[0]=='c') { - while (v.size()<3) v.push_back(0); - comp+=itos(ncomp)+" "; - sb=11; // count context bits - if (v[2]<256) sb+=lg(v[2]); - else sb+=6; - for (unsigned i=3; imembits) sb=membits; - if (v[1]%1000==0) comp+="icm "+itos(sb-6-v[1]/1000)+"\n"; - else comp+="cm "+itos(sb-2-v[1]/1000)+" "+itos(v[1]%1000-1)+"\n"; - - // special contexts - hcomp+="d= "+itos(ncomp)+" *d=0\n"; - if (v[2]>1 && v[2]<=255) { // periodic context - if (lg(v[2])!=lg(v[2]-1)) - hcomp+="a=c a&= "+itos(v[2]-1)+" hashd\n"; - else - hcomp+="a=c a%= "+itos(v[2])+" hashd\n"; - } - else if (v[2]>=1000 && v[2]<=1255) // distance context - hcomp+="a= 255 a+= "+itos(v[2]-1000)+ - " d=a a=*d a-=c a> 255 if a= 255 endif d= "+ - itos(ncomp)+" hashd\n"; - - // Masked context - for (unsigned i=3; i0 && v[i]<255) - hcomp+="a=*b a&= "+itos(v[i])+" hashd\n"; // masked byte - else if (v[i]>=256 && v[i]<512) { // lz77 state or masked literal byte - hcomp+= - "a=r 1 a> 1 if\n" // expect literal or offset - " a=r 2 a< 64 if\n" // expect literal - " a=*b "; - if (v[i]<511) hcomp+="a&= "+itos(v[i]-256); - hcomp+=" hashd\n" - " else\n" // expect match offset byte - " a>>= 6 hashd a=r 1 hashd\n" - " endif\n" - "else\n" // expect new code - " a= 255 hashd a=r 2 hashd\n" - "endif\n"; - } - else if (v[i]>=1256) // skip v[i]-1000 bytes - hcomp+="a= "+itos(((v[i]-1000)>>8)&255)+" a<<= 8 a+= " - +itos((v[i]-1000)&255)+ - " a+=b b=a\n"; - else if (v[i]>1000) - hcomp+="a= "+itos(v[i]-1000)+" a+=b b=a\n"; - if (v[i]<512 && iint(v[0]=='t')) { - if (v.size()<=1) v.push_back(8); - if (v.size()<=2) v.push_back(24+8*(v[0]=='s')); - if (v[0]=='s' && v.size()<=3) v.push_back(255); - comp+=itos(ncomp); - sb=5+v[1]*3/4; - if (v[0]=='m') - comp+=" mix "+itos(v[1])+" 0 "+itos(ncomp)+" "+itos(v[2])+" 255\n"; - else if (v[0]=='t') - comp+=" mix2 "+itos(v[1])+" "+itos(ncomp-1)+" "+itos(ncomp-2) - +" "+itos(v[2])+" 255\n"; - else // s - comp+=" sse "+itos(v[1])+" "+itos(ncomp-1)+" "+itos(v[2])+" " - +itos(v[3])+"\n"; - if 
(v[1]>8) { - hcomp+="d= "+itos(ncomp)+" *d=0 b=c a=0\n"; - for (; v[1]>=16; v[1]-=8) { - hcomp+="a<<= 8 a+=*b"; - if (v[1]>16) hcomp+=" b++"; - hcomp+="\n"; - } - if (v[1]>8) - hcomp+="a<<= 8 a+=*b a>>= "+itos(16-v[1])+"\n"; - hcomp+="a<<= 8 *d=a\n"; - } - ++ncomp; - } - - // i: ISSE chain with order increasing by N1,N2... - if (v[0]=='i' && ncomp>0) { - assert(sb>=5); - hcomp+="d= "+itos(ncomp-1)+" b=c a=*d d++\n"; - for (unsigned i=1; imembits) sb=membits; - comp+=itos(ncomp)+" isse "+itos(sb-6-v[i]/10)+" "+itos(ncomp-1)+"\n"; - ++ncomp; - } - } - - // a24,0,0: MATCH. N1=hash multiplier. N2,N3=halve buf, table. - if (v[0]=='a') { - if (v.size()<=1) v.push_back(24); - while (v.size()<4) v.push_back(0); - comp+=itos(ncomp)+" match "+itos(membits-v[3]-2)+" " - +itos(membits-v[2])+"\n"; - hcomp+="d= "+itos(ncomp)+" a=*d a*= "+itos(v[1]) - +" a+=*c a++ *d=a\n"; - sb=5+(membits-v[2])*3/4; - ++ncomp; - } - - // w1,65,26,223,20,0: ICM-ISSE chain of length N1 with word contexts, - // where a word is a sequence of c such that c&N4 is in N2..N2+N3-1. - // Word is hashed by: hash := hash*N5+c+1 - // Decrease memory by 2^-N6. - if (v[0]=='w') { - if (v.size()<=1) v.push_back(1); - if (v.size()<=2) v.push_back(65); - if (v.size()<=3) v.push_back(26); - if (v.size()<=4) v.push_back(223); - if (v.size()<=5) v.push_back(20); - if (v.size()<=6) v.push_back(0); - comp+=itos(ncomp)+" icm "+itos(membits-6-v[6])+"\n"; - for (int i=1; i0; --i) - hcomp+=" d= "+itos(ncomp+i-1)+" a=*d d++ *d=a\n"; - hcomp+=" d= "+itos(ncomp)+" *d=0\n" - "endif\n"; - ncomp+=v[1]-1; - sb=membits-v[6]; - ++ncomp; - } - } - return hdr+itos(ncomp)+"\n"+comp+hcomp+"halt\n"+pcomp; -} - -// Compress from in to out in 1 segment in 1 block using the algorithm -// descried in method. If method begins with a digit then choose -// a method depending on type. Save filename and comment -// in the segment header. 
If comment is 0 then the default is the input size -// as a decimal string, plus " jDC\x01" for a journaling method (method[0] -// is not 's'). Write the generated method to methodOut if not 0. -void compressBlock(StringBuffer* in, Writer* out, const char* method_, - const char* filename, const char* comment, bool dosha1) { - assert(in); - assert(out); - assert(method_); - assert(method_[0]); - std::string method=method_; - const unsigned n=in->size(); // input size - const int arg0=MAX(lg(n+4095)-20, 0); // block size - assert((1u<<(arg0+20))>=n+4096); - - // Get type from method "LB,R,t" where L is level 0..5, B is block - // size 0..11, R is redundancy 0..255, t = 0..3 = binary, text, exe, both. - unsigned type=0; - if (isdigit(method[0])) { - int commas=0, arg[4]={0}; - for (int i=1; ic_str(), n); - sha1ptr=sha1.result(); - } - - // Expand default methods - if (isdigit(method[0])) { - const int level=method[0]-'0'; - assert(level>=0 && level<=9); - - // build models - const int doe8=(type&2)*2; - method="x"+itos(arg0); - std::string htsz=","+itos(19+arg0+(arg0<=6)); // lz77 hash table size - std::string sasz=","+itos(21+arg0); // lz77 suffix array size - - // store uncompressed - if (level==0) - method="0"+itos(arg0)+",0"; - - // LZ77, no model. 
Store if hard to compress - else if (level==1) { - if (type<40) method+=",0"; - else { - method+=","+itos(1+doe8)+","; - if (type<80) method+="4,0,1,15"; - else if (type<128) method+="4,0,2,16"; - else if (type<256) method+="4,0,2"+htsz; - else if (type<960) method+="5,0,3"+htsz; - else method+="6,0,3"+htsz; - } - } - - // LZ77 with longer search - else if (level==2) { - if (type<32) method+=",0"; - else { - method+=","+itos(1+doe8)+","; - if (type<64) method+="4,0,3"+htsz; - else method+="4,0,7"+sasz+",1"; - } - } - - // LZ77 with CM depending on redundancy - else if (level==3) { - if (type<20) // store if not compressible - method+=",0"; - else if (type<48) // fast LZ77 if barely compressible - method+=","+itos(1+doe8)+",4,0,3"+htsz; - else if (type>=640 || (type&1)) // BWT if text or highly compressible - method+=","+itos(3+doe8)+"ci1"; - else // LZ77 with O0-1 compression of up to 12 literals - method+=","+itos(2+doe8)+",12,0,7"+sasz+",1c0,0,511i2"; - } - - // LZ77+CM, fast CM, or BWT depending on type - else if (level==4) { - if (type<12) - method+=",0"; - else if (type<24) - method+=","+itos(1+doe8)+",4,0,3"+htsz; - else if (type<48) - method+=","+itos(2+doe8)+",5,0,7"+sasz+"1c0,0,511"; - else if (type<900) { - method+=","+itos(doe8)+"ci1,1,1,1,2a"; - if (type&1) method+="w"; - method+="m"; - } - else - method+=","+itos(3+doe8)+"ci1"; - } - - // Slow CM with lots of models - else { // 5..9 - - // Model text files - method+=","+itos(doe8); - if (type&1) method+="w2c0,1010,255i1"; - else method+="w1i1"; - method+="c256ci1,1,1,1,1,1,2a"; - - // Analyze the data - const int NR=1<<12; - int pt[256]={0}; // position of last occurrence - int r[NR]={0}; // count repetition gaps of length r - const unsigned char* p=in->data(); - if (level>0) { - for (unsigned i=0; i0 && kscore) score=s, period=j; - t+=r[j]; - } - if (period>4 && score>0.1) { - method+="c0,0,"+itos(999+period)+",255i1"; - if (period<=255) - method+="c0,"+itos(period)+"i1"; - n1-=r[period]; - 
r[period]=0; - } - else - break; - } - method+="c0,2,0,255i1c0,3,0,0,255i1c0,4,0,0,0,255i1mm16ts19t0"; - } - } - - // Compress - std::string config; - int args[9]={0}; - config=makeConfig(method.c_str(), args); - assert(n<=(0x100000u<=1 && args[1]<=7 && args[1]!=4) { // LZ77 or BWT - LZBuffer lz(*in, args); - co.setInput(&lz); - co.compress(); - } - else { // compress with e8e9 or no preprocessing - if (args[1]>=4 && args[1]<=7) - e8e9(in->data(), in->size()); - co.setInput(in); - co.compress(); - } -#ifdef DEBUG // verify pre-post processing are inverses - int64_t outsize; - const char* sha1result=co.endSegmentChecksum(&outsize, dosha1); - assert(sha1result); - assert(sha1ptr); - if (memcmp(sha1result, sha1ptr, 20)!=0) - error("Pre/post-processor test failed"); -#else - co.endSegment(sha1ptr); -#endif - co.endBlock(); -} - - // Handle errors in libzpaq and elsewhere - void error(const char* msg) { - if (strstr(msg, "ut of memory")) throw std::bad_alloc(); - throw std::runtime_error(msg); - } - -} // end namespace libzpaq - - diff --git a/lib/algorithm/compression/libzpaq.h b/lib/algorithm/compression/libzpaq.h deleted file mode 100644 index f790c7a..0000000 --- a/lib/algorithm/compression/libzpaq.h +++ /dev/null @@ -1,1511 +0,0 @@ -/* libzpaq.h - LIBZPAQ Version 7.12 header - Apr. 19, 2016. - - This software is provided as-is, with no warranty. - I, Matt Mahoney, release this software into - the public domain. This applies worldwide. - In some countries this may not be legally possible; if so: - I grant anyone the right to use this software for any purpose, - without any conditions, unless such conditions are required by law. - -LIBZPAQ is a C++ library providing data compression and decompression -services using the ZPAQ level 2 format as described in -http://mattmahoney.net/zpaq/ - -An application wishing to use these services should #include "libzpaq.h" -and link to libzpaq.cpp (and advapi32.lib in Windows/VC++). 
-libzpaq recognizes the following options: - - -DDEBUG Turn on assertion checks (slower). - -DNOJIT Don't assume x86-32 or x86-64 with SSE2 (slower). - -Dunix Without -DNOJIT, assume Unix (Linux, Mac) rather than Windows. - -The application must provide an error handling function and derived -implementations of two abstract classes, Reader and Writer, -specifying the input and output byte streams. For example, to compress -from stdin to stdout (assuming binary I/O as in Linux): - - #include "libzpaq.h" - #include - #include - - void libzpaq::error(const char* msg) { // print message and exit - fprintf(stderr, "Oops: %s\n", msg); - exit(1); - } - - class In: public libzpaq::Reader { - public: - int get() {return getchar();} // returns byte 0..255 or -1 at EOF - } in; - - class Out: public libzpaq::Writer { - public: - void put(int c) {putchar(c);} // writes 1 byte 0..255 - } out; - - int main() { - libzpaq::compress(&in, &out, "1"); // "0".."5" = faster..better - } - -Or to decompress: - - libzpaq::decompress(&in, &out); - -The function error() will be called with an English language message -in case of an unrecoverable error such as badly formatted compressed -input data or running out of memory. error() should not return. -In a multi-threaded application where ZPAQ blocks are being decompressed -in separate threads, error() should exit the thread, but other threads -may continue. Blocks are independent and libzpaq is thread safe. - -Reader and Writer provide default implementations of read() and write() -for block I/O. You may override these with your own versions, which -might be faster. The default is to call get() or put() the appropriate -number of times. For example: - - // Read n bytes into buf[0..n-1] or to EOF, whichever is first. - // Return the number of bytes actually read. 
- int In::read(char* buf, int n) {return fread(buf, 1, n, stdin);} - - // Write buf[0..n-1] - void Out::write(char* buf, int n) {fwrite(buf, 1, n, stdout);} - -By default, compress() divides the input into blocks with one segment -each. The segment filename field is empty. The comment field of each -block is the uncompressed size as a decimal string. The checksum -is saved. To override: - - compress(&in, &out, "1", "filename", "comment", false); - -If the filename is not NULL then it is saved in the first block only. -If the comment is not NULL then a space and the comment are appended -to the decimal size in the first block only. The comment would normally -be the date and attributes like "20141231235959 w32", or "jDC\x01" for -a journaling archive as described in the ZPAQ specification. - -The method string has the general form of a concatenation of single -character commands each possibly followed by a list of decimal -numeric arguments separated by commas or periods: - - {012345xciawmst}[N1[{.,}N2]...]... - -For example "1" or "14,128,0" or "x6.3ci1m". - -Only the first command can be a digit 0..5. If it is, then it selects -a compression level and the other commands are ignored. Otherwise, -if it is "x" then the arguments and remaining commands describe -the compression method. Any other letter as the first command is -interpreted the same as "x". - -Higher compression levels are slower but compress better. "1" is -good for most purposes. "0" does not compress. "2" compresses slower -but decompression is just as fast as 1. "3", "4", and "5" also -decompress slower. The numeric arguments are as follows: - - N1: 0..11 = block size of at most 2^N1 MiB - 4096 bytes (default 4). - N2: 0..255 = estimated ease of compression (default 128). - N3: 0..3 = data type. 1 = text, 2 = exe, 3 = both (default 0). - -For example, "14" or "54" divide the input in 16 MB blocks which -are compressed independently. 
N2 and N3 are hints to the compressor -based on analysis of the input data. N2 is 0 if the data is random -or 255 if the data is easily compressed (for example, all zero bytes). -Most compression methods will simply store random data with no -compression. The default is "14,128,0". - -If the first command is "x" then the string describes the exact -compression method. The arguments to "x" describe the pre/post -processing (LZ77, BWT, E8E9), and remaining commands describe the -context model, if any, of the transformed data. The arguments to "x" are: - - N1: 0..11 = block size as before. - N2: 0..7: 0=none, 1=packed LZ77, 2=LZ77, 3=BWT, 4..7 = 0..3 + E8E9. - N3: 4..63: LZ77 min match. - N4: LZ77 secondary match to try first or 0 to skip. - N5: LZ77 log search depth. - N6: LZ77 log hash table size, or N1+21 to use a suffix array. - N7: LZ77 lookahead. - -N2 selects the basic transform applied before context modeling. -N2 = 0 does not transform the input. N2 = 1 selects LZ77 encoding -of literals strings and matches using bit-packed codes. It is normally -not used with a context model. N2 = 2 selects byte aligned LZ77, which -compresses worse by itself but better than 1 when a context model is -used. It uses single bytes to encode either a literal of length 1..64 -or a match of length N3..N3+63 with a 2, 3, or 4 byte offset. - -N2 = 3 selects a Burrows-Wheeler transform, in which the input is -sorted by right-context. This does not compress by itself but makes -the data more compressible using a low order, adaptive context model. -BWT requires 4 times the block size in additional memory for both -compression and decompression. - -N2 = 4..7 are the same as 0..3 except that a E8E9 transform is first applied -to improve the compression of x86 code usually found .exe and .dll files. 
-It scans the input block backward for 5 byte strings of the form -{E8|E9 xx xx xx 00|FF} and adds the offset from the start of the -block to the middle 3 bytes interpreted as a little-endian (LSB first) -number (mod 2^24). E8 and E9 are the CALL and JMP instructions, followed -by a 32 bit relative offset. - -N3..N7 apply only to LZ77. For either type, it searches for matches -by hashing the next N4 bytes, and then the next N3 bytes, and looking -up each of the hashes at 2^N5 locations in a table with 2^N6 entries. -Of those, it picks the longest match, or closest in case of a tie. -If no match is at least N3, then a literal is encoded instead. If N5 -is 0 then only one hash is computed, which is faster but does not -compress as well. Typical good values for fast compression are -"x4.1.5.0.3.22" which means 16 MiB blocks, packed LZ77, mininum match -length 5, no secondary match, search depth 2^3 = 8, and 2^22 = 4M -hash table (using 16 MiB memory). - -The hash table requires 4 x 2^N6 bytes of memory. If N6 = N1+21, then -matches are found using a suffix array and inverse suffix array using -2.25 x 2^N6 bytes (4.5 x block size). This finds better matches but -takes longer to compute the suffix array (SA). The matches are found by -searching forward and backward in the SA 2^N5 in each direction up -to the first earlier match, and picking the longer of the two. -Good values are "x4.1.4.0.8.25". The secondary match N4 has no effect. - -N7 is the lookahead. It looks for matches of length at least N4+N7 -when using a hash table or N3+N7 for a SA, but allows the first N7 -bytes not to match and be coded as literals if this results in -a significantly longer match. Values higher than 1 are rarely effective. -The default is 0. - -All subsequent commands after "x" describe a context model. A model -consists of a set of components that output a bit prediction, taking -a context and possibly earlier predictions as input. The final prediction -is arithmetic coded. 
The component types are: - - c = CM or ICM (context model or indirect context model). - i = ISSE chain (indirect secondary symbol estimator). - a = MATCH. - w = word model (ICM-ISSE chain with whole word contexts). - m = MIX. - s = SSE (secondary symbol estimator). - t = MIX2 (2 input MIX). - -For example, "x4.3ci1" describes a BWT followed by an order 0 CM -and order 1 ISSE, which is used for level 3 text compression. The -parameters to "c" (default all 0) are as follows: - - N1: 0 = ICM, 1..256 CM with faster..slower adaptation, +1000 halves memory. - N2: 1..255 = offset mod N2, 1000..1255 = offset to last N2-1000 byte. - N3: 0..255 = order 0 context mask, 256..511 mixes LZ77 parse state. - N4...: 0..255 order 1... context masks. 1000... skips N4-1000 bytes. - -Most components use no more memory than the block size, depending on -the number of context bits, but it is possible to select less memory -and lose compression. - -A CM inputs a context hash and outputs a prediction from a table. -The table entry is then updated by adjusting in the direction of the -actual bit. The adjustment is 1/count, where the maximum count is 4 x N1. -Larger values are best for stationary data. Smaller values adapt faster -to changing data. - -If N1 is 0 then c selects an ICM. An ICM maps a context to a bit history -(8 bit state), and then to slow adapting prediction. It is generally -better than a CM on most nonstationary data. - -The context for a CM or ICM is a hash of all selected contexts: a -cyclic counter (N2 = 1..255), the distance from the last occurrence -of some byte value (N2 = 1000..1255), and the masked history of the -last 64K bytes ANDED with N3, N4... For example, "c0.0.255.255.255" is -an order 3 ICM. "C0.1010.255" is an order 1 context hashed together -with the column number in a text file (distance to the last linefeed, -ASCII 10). "c256.0.255.1511.255" is a stationary grayscale 512 byte -wide image model using the two previous neighboring pixels as context. 
-"c0.0.511.255" is an order 1 model for LZ77, which helps compress -literal strings. The LZ77 state context applies only to byte aligned -LZ77 (type 2 or 6). - -The parameters to "i" (ISSE chain) are the initial context length and -subsequent increments for a chain connected to an existing earlier component. -For example, "ci1.1.2" specifies an ICM (order 0) followed by a chain -of 3 ISSE with orders 1, 2, and 4. An ISSE maps a context to a bit -history like an ISSE, but uses the history to select a pair of weights -to mix the input prediction with a constant 1, thus performing the -mapping q' := w1 x q + w2 in the logistic domain (q = log p/(1-p)). -The mixer is then updated by adjusting the weights to improve the -prediction. High order ISSE chains (like "x4.0ci1.1.1.1.2") and BWT -followed by a low order chain (like "x4.3ci1") both provide -excellent general purpose compression. - -A MATCH ("a") keeps a rotating history buffer and a hash table to look -up the previous occurrence of the current context hash and predicts -whatever bit came next. The parameters are: - - N1 = hash multiplier, default 24. - N2 = halve buffer size, default 0 = same size as input block. - N3 = halve hash table size, default 0 = block size / 4. - -For example, "x4.0m24.1.1" selects a 16 MiB block size, 8 MiB match -buffer size, and 2M hash table size (using 8 MiB at 4 bytes per entry). -The hash is computed as hash := hash x N1 + next_byte + 1 (mod hash table -size). Thus, N1 = 12 selects a higher order context, and N1 = 48 selects a -lower order. - -A word model ('w") is an ICM-ISSE chain of length N1 (orders 0..N1-1) -in which the contexts are whole words. A word is defined as the set -of characters in the range N2..N2+N3-1 after ANDing with N4. The context -is hashed using multiplier N5. Memory is halved by N6. 
The default is -"w1.65.26.223.20.0" which is a chain of length 1 (ICM only), where words -are in range 65 ('A') to 65+26-1 ('Z') after ANDing with 223 (which -converts to upper case). The hash multiplier is 20, which has the -effect of shifting the high 2 bits out of the hash. The memory usage -of each component is the same as the block size. - -A MIX ("m") performs the weighted average of all previous component -predictions. The weights are then adjusted to improve the prediction -by favoring the most accurate components. N1 selects the number of -context bits (not hashed) to select a set of weights. N2 is the -learning rate (around 16..32 works well). The default is "m8.24" -which selects the previously modeled bits of the current byte as -context. When N1 is not a multiple of 8, it selects the most significant -bits of the oldest byte. - -A SSE ("s") adjusts the previous prediction like an ISSE, but uses -a direct lookup table of the quantized and interpolated input prediction -and a direct (not hashed) N1-bit context. The adjustment is 1/count where -the count is allowed to range from N2 to 4 x N3. The default -is "s8.32.255". - -A MIX2 ("t") is a MIX but mixing only the last 2 components. The -default is "t8.24" where the meaning is the same as "m". - -For example, a good model for text is "x6.0ci1.1.1.1.2aw2mm16tst" -which selects 2^6 = 64 MiB blocks, no preprocessing, an order 0 ICM, -an ISSE chain with orders 1, 2, 3, 4, 6, a MATCH, an order 0-1 word -ICM-ISSE chain, two mixers with 0 and 1 byte contexts, whose outputs are -mixed by a MIX2. The MIX2 output is adjusted by a SSE, and finally -the SSE input and outputs are mixed again for the final bit prediction. - - -COMPRESSBLOCK - -CompressBlock() takes the same arguments as compress() except that -the input is a StringBuffer instead of a Reader. The output is always -a single block, regardless of the N1 (block size) argument in the method. 
- - void compressBlock(StringBuffer* in, Writer* out, const char* method, - const char* filename=0, const char* comment=0, - bool compute_sha1=false); - -A StringBuffer is both a Reader and a Writer, but also allows random -memory access. It provides convenient and efficient storage when the -input size is unknown. - - class StringBuffer: public libzpaq::Reader, public libzpaq::Writer { - public: - StringBuffer(size_t n=0); // initial allocation after first use - ~StringBuffer(); - int get(); // read 1 byte or EOF from memory - int read(char* buf, int n); // read n bytes - void put(int c); // write 1 byte to memory - void write(const char* buf, int n); // write n bytes - const char* c_str() const; // read-only access to written data - unsigned char* data(); // read-write access - size_t size() const; // number of bytes written - size_t remaining() const; // number of bytes to read until EOF - void setLimit(size_t n); // set maximum write size - void reset(); // discard contents and free memory - void resize(size_t n); // truncate to n bytes - void swap(StringBuffer& s); // exchange contents efficiently - }; - -The constructor sets the inital allocation size after the first -write to n or 128, whichever is larger. Initially, no memory is allocated. -The allocated size is always n x (2^k - 1), for example -128 x (1, 3, 7, 15, 31...). - -put() and write() append 1 or n bytes, allocating memory as needed. -buf can be NULL and the StringBuffer will be enlarged by n. -get() and read() read 1 or up to n bytes. get() returns EOF if you -attempt to read past the end of written data. read() returns less -than n if it reaches EOF first, or 0 at EOF. - -size() is the number of bytes written, which does not change when -data is read. remaining() is the number of bytes left to read -before EOF. - -c_str() provides read-only access to the data. It is not NUL terminated. -data() provides read-write access. Either may return NULL if size() -is 0. 
write(), put(), reset(), swap(), and the destructor may -invalidate saved pointers. - -setLimit() sets a maximum size. It will call error() if you try to -write past it. The default is -1 or no limit. - -reset() sets the size to 0 and frees memory. resize() sets the size -to n by moving the write pointer, but does not allocate or free memory. -Moving the pointer forward does not overwrite the previous contents -in between. The write pointer can be moved past the end of allocated -memory, and the next put() or write() will allocate as needed. If the -write pointer is moved back before the read pointer, then remaining() -is set to 0. - -swap() swaps 2 StringBuffers efficiently, but does not change their -initial allocations. - - -DECOMPRESSER - -decompress() will decompress any valid ZPAQ stream, which may contain -multiple blocks with multiple segments each. It will ignore filenames, -comments, and checksums. You need the Decompresser class if you want to -do something other than decompress all of the data serially to a single -file. To decompress individual blocks and segments and retrieve the -filenames, comments, data, and hashes of each segment (in exactly this -order): - - libzpaq::Decompresser d; // to decompress - libzpaq::SHA1 sha1; // to verify output hashes - double memory; // bytes required to decompress - Out filename, comment; - char sha1out[21]; - d.setInput(&in); - while (d.findBlock(&memory)) { // default is NULL - while (d.findFilename(&filename)) { // default is NULL - d.readComment(&comment); // default is NULL - d.setOutput(&out); // if omitted or NULL, discard output - d.setSHA1(&sha1); // optional - while (d.decompress(1000)); // bytes to decode, default is all - d.readSegmentEnd(sha1out); // {0} or {1,hash[20]} - if (sha1out[0]==1 && memcmp(sha1.result(), sha1out+1, 20)) - error("checksum error"); - } - } - -findBlock() scans the input for the next ZPAQ block and returns true -if found. 
It optionally sets memory to the approximate number of bytes -that it will allocate at the first call to decompress(). - -findFilename() finds the next segment and returns false if there are -no more in the current block. It optionally writes the saved filename. - -readComment() optionally writes the comment. It must be called -after reading the filename and before decompressing. - -setSHA1() specifies an SHA1 object for computing a hash of the segment. -It may be omitted if you do not want to compute a hash. - -decompress() decodes the requested number of bytes, postprocesses them, -and writes them to out. For the 3 built in compression levels, this -is the same as the number of bytes output, but it may be different if -postprocessing was used. It returns true until there is no more data -to decompress in the current segment. The default (-1) is to decompress the -whole segment. - -readSegmentEnd() skips any remaining data not yet decompressed in the -segment and writes 21 bytes, either a 0 if no hash was saved, -or a 1 followed by the 20 byte saved hash. If any data is skipped, -then all data in the remaining segments in the current block must -also be skipped. - - -SHA1 - -The SHA1 object computes SHA-1 cryptographic hashes. It is safe to -assume that two inputs with the same hash are identical. For example: - - libzpaq::SHA1 sha1; - int ch; - while ((ch=getchar())!=EOF) - sha1.put(ch); - printf("Size is %1.0f or %1.0f bytes\n", sha1.size(), double(sha1.usize())); - -size() returns the number of bytes read as a double, and usize() as a -64 bit integer. result() returns a pointer to the 20 byte hash and -resets the size to 0. The hash (not just the pointer) should be copied -before the next call to result() if you want to save it. You can also -call sha1.write(buffer, n) to hash n bytes of char* buffer. - - -COMPRESSOR - -A Compressor object allows greater control over the compressed data. 
-In particular you can specify the compression algorithm in ZPAQL to -specify methods not possible using compress() or compressBlock(). You -can create blocks with multiple segments specifying different files, -or compress streams of unlimited size to a single block when the -input size is not known. - - libzpaq::Compressor c; - for (int i=0; i 128) or 0 (c < 128). - CM s t context model with 2^s contexts, learning rate 1/4t. - ICM s indirect context model with 2^(s+6) contexts. - MATCH s b match model with 2^s context hashes and 2^b history. - AVG j k wt average components j and k with weight wt/256 for j. - MIX2 s j k r x average j and k with 2^s contexts, rate r, mask x. - MIX s j m r x average j..j+m-1 with 2^s contexts, rate r, mask x. - ISSE s j adjust prediction j using 2^(s+6) indirect contexts. - SSE s j t1 t2 adjust j using 2^s direct contexts, rate 1/t1..1/4t2. - -A CONST predicts a 1 with probability 1/(1+exp((128-c)/16)), i.e -numbers near 0 or 255 are the most confident. - -A CM maps a context to a prediction and a count. It is updated by -adjusting the prediction to reduce the error by 1/count and incrementing -the count up to 4t. - -A ICM maps a s+10 bit context hash to a bit history (8 bit state) -representing a bounded count of zeros and ones previously seen in the -context and which bit was last. The bit history is mapped to a -prediction, which is updated by reducing the error by 1/1024. -The initial prediction is estimated from the counts represented by each -bit history. - -A MATCH looks up a context hash and predicts whatever bit came next -following the previous occurrence in the history buffer. The strength -of the prediction depends on the match length. - -AVG, MIX2, and MIX perform weighted averaging of predictions in the -logistic domain (log(p/(1-p))). AVG uses a fixed weight. MIX2 and MIX -adjust the weights (selected by context) to reduce prediction error -by a rate that increases with r. 
The mask is AND-ed with the current -partially coded byte to compute that context. Normally it is 255. -A MIX takes a contiguous range of m components as input. - -ISSE adjusts a prediction using a bit history (as with an ICM) to -select a pair of weights for a 2 input MIX. It mixes the input -prediction with a constant 1 in the logistic domain. - -SSE adjusts a logistic prediction by quantizing it to 32 levels and -selecting a new prediction from a table indexed by context, interpolating -between the nearest two steps. The nearest prediction error is -reduced by 1/count where count increments from t1 to 4*t2. - -Contexts are computed and stored in an array H of 32 bit unsigned -integers by the HCOMP program written in ZPAQL. The program is called -after encoding a whole byte. To form a complete context, these values -are combined with the previous 0 to 7 bits of the current parital byte. -The method depends on the component type as follows: - - CM: H[i] XOR hmap4(c). - ICM, ISSE: hash table lookup of (H[i]*16+c) on nibble boundaries. - MIX2, MIX: H[i] + (c AND x). - SSE: H[i] + c. - -where c is the previous bits with a leading 1 bit (1, 1x, 1xx, ..., -1xxxxxxx where x is a previously coded bit). hmap4(c) maps c -to a 9 bit value to reduce cache misses. The first nibble is -mapped as before and the second nibble with 1xxxx in the high -5 bits. For example, after 6 bits, where c = 1xxxxxx, -hmap4(c) = 1xxxx01xx with the bits in the same order. - -There are two ZPAQL virtual machines, HCOMP to compute contexts -and PCOMP to post-process the decoded output. Each has the -following state: - - PC: 16 bit program counter. - A, B, C, D, R0...R255: 32 bit unsigned registers. - F: 1 bit condition register. - H: array of 2^h 32 bit unsigned values (output for HCOMP). - M: array of 2^m 8 bit unsigned values. - -All values are initialized to 0 at the beginning of a block -and retain their values between calls. There are two machines. 
-HCOMP is called after coding each byte with the value of that -byte in A. PCOMP, if present, is called once for each decoded -byte with that byte in A, and once more at the end of each -segment with 2^32 - 1 in A. - -Normally, A is an accumulator. It is the destination of all -binary operations except assignment. The low m bits of B and -C index M. The low h bits of D indexes H. We write *B, *C, *D -to refer to the elements they point to. The instruction set -is as follows, where X is A, B, C, D, *B, *C, *D except as -indicated. X may also be a constant 0...255, written with -a leading space if it appears on the right side of an operator, -e.g. "*B= 255". Instructions taking a numeric argument are 2 bytes, -otherwise 1. Arithmetic is modulo 2^32. - - X<>A Swap X with A (X cannot be A). - X++ Add 1. - X-- Subtract 1. - X! Complement bits of X. - X=0 Clear X (1 byte instruction). - X=X Assignment to left hand side. - A+=X Add to A - A-=X Subtract from A - A*=X Multipy - A/=X Divide. If X is 0 then A=0. - A%=X Mod. If X is 0 then A=0. - A&=X Clear bits of A that are 0 in X. - A&~X Clear bits of A that are 1 in X. - A|=X Set bits of A that are 1 in X. - A^=X Complement bits of A that are set in X. - A<<=X Shift A left by (X mod 32) bits. - A>>=X Shift right (zero fill) A by (X mod 32) bits. - A==X Set F=1 if equal else F=0. - AX Set F=1 if greater else F=0. - X=R N Set A,B,C,D to RN (R0...R255). - R=A N Set R0...R255 to A. - JMP N Jump N=-128...127 bytes from next instruction. - JT N Jump N=-128...127 if F is 1. - JF N Jump N=-128...127 if F is 0. - LJ N Long jump to location 0...65535 (only 3 byte instruction). - OUT Output A (PCOMP only). - HASH A=(A+*B+512)*773. - HASHD *D=(*D+A+512)*773. - HALT Return at end of program. - ERROR Fail if executed. - -Rather than using jump instructions, the following constructs are -allowed and translated appropriately. - - IF ... ENDIF Execute if F is 1. - IFNOT ... ENDIF Execute if F is 0. - IF ... ELSE ... 
ENDIF Execute first part if F is 1 else second part. - IFNOT ... ELSE ... ENDIF Execute first part if F is 0 else second part. - DO ... WHILE Loop while F is 1. - DO ... UNTIL Loop while F is 0. - DO ... FOREVER Loop unconditionally. - -Forward jumps (IF, IFNOT, ELSE) will not compile if beyond 127 -instructions. In that case, use the long form (IFL, IFNOTL, ELSEL). -DO loops automatically use long jumps if needed. IF and DO loops -may intersect. For example, DO ... IF ... FOREVER ENDIF is equivalent -to a while-loop. - -A config argument without a postprocessor has the following syntax: - - COMP hh hm ph pm n - i COMP args... - HCOMP - zpaql... - END (or POST 0 END for backward compatibility) - -With a postprocessor: - - COMP hh hm ph pm n - i COMP args... - HCOMP - zpaql... - PCOMP command args... ; - zpaql... - END - -In HCOMP, H and M have sizes 2^hh and 2^hm respectively. In PCOMP, -H and M have sizes 2^ph and 2^pm respectively. There are n components, -which must be numbered i = 0 to n-1. If a postprocessor is used, then -"command args..." is written to the Writer* passed as the 4'th argument, -but otherwise ignored. A typical use in a development environment might -be to call an external program that will be passed two additional -arguments on the command line, the input and output file names -respectively. - -You can pass up to 9 signed numeric arguments in args[]. In any -place that a number "N" is allowed, you can write "$M" or "$M+N" -(like "$1" or $9+25") and value args[M-1]+N will be substituted. - -ZPAQL allows (nested) comments in parenthesis. It is not case sensitive. -If there are input errors, then error() will report the error. If the -string contains newlines, it will report the line number of the error. - -ZPAQL is compiled internally into a byte code, and then to native x86 -32 or 64 bit code (unless compiled with -DNOJIT, in which case the -byte code is interpreted). 
You can also specify the algorithm directly -in byte code, although this is less convenient because it requires two -steps: - - c.startBlock(hcomp); // COMP and HCOMP at start of block - c.postProcess(pcomp, 0); // PCOMP right before compress() in first segment - -This is necessary because the COMP and HCOMP sections are stored in -the block header, but the PCOMP section is compressed in the first -segment after the filename and comment but before any data. - -To retrive compiled byte code in suitable format after startBlock(): - - c.hcomp(&out); // writes COMP and HCOMP sections - c.pcomp(&out); // writes PCOMP section if any - -Or during decompression: - - d.hcomp(&out); // valid after findBlock() - d.pcomp(&out); // valid after decompress(0) in first segment - -Both versions of pcomp() write nothing and return false if there is no -PCOMP section. The output of hcomp() and pcomp() may be passed to the -input of startBlock() and postProcess(). These are strings in which the -first 2 bytes encode the length of the rest of the string, least -significant byte first. Alternatively, postProcess() allows the length to -be omitted and passed separately as the second argument. In the case -of decompression, the HCOMP and PCOMP strings are read from the archive. -The preprocessor command (from "PCOMP cmd ;") is not saved in the compressed -data. - - -ARRAY - -The libzpaq::Array template class is convenient for creating arrays aligned -on 64 byte addresses. It calls error("Out of memory") if needed. -It is used as follows: - - libzpaq::Array a(n); // array a[0]..a[n-1] of type T, zeroed - a.resize(n); // change size and zero contents - a[i] // i'th element - a(i) // a[i%n], valid only if n is a power of 2 - a.size() // n (as a size_t) - a.isize() // n (as a signed int) - -T should be a simple type without constructors or destructors. Arrays -cannot be copied or assigned. 
You can also specify the size: - - Array a(n, e); // n << e - a.resize(n, e); // n << e - -which is equivalent to n << e except that it calls error("Array too big") -rather than overflow if n << e would require more than 32 bits. If -compiled with -DDEBUG, then bounds are checked at run time. - - -ENCRYPTION - -There is a class libzpaq::SHA256 with put(), result(), size(), and usize() -as in SHA1. result() returns a 32 byte SHA-256 hash. It is used by scrypt. - -The libzpaq::AES_CTR class allows encryption in CTR mode with 128, 192, -or 256 bit keys. The public members are: - -class AES_CTR { -public: - AES_CTR(const char* key, int keylen, char* iv=0); - void encrypt(U32 s0, U32 s1, U32 s2, U32 s3, unsigned char* ct); - void encrypt(char* buf, int n, U64 offset); -}; - -The constructor initializes with a 16, 24, or 32 byte key. The length -is given by keylen. iv can be an 8 byte string or NULL. If not NULL -then iv0, iv1 are initialized with iv[0..7] in big-endian order, else 0. - -encrypt(s0, s1, s2, s3, ct) encrypts a plaintext block divided into -4 32-bit words MSB first. The first byte of plaintext is the high 8 -bits of s0. The output is to ct[16]. - -encrypt(buf, n, offset) encrypts or decrypts an n byte slice of a string -starting at offset. The i'th 16 byte block is encrypted by XOR with -the result (in ct) of encrypt(iv0, iv1, i>>32, i&0xffffffff, ct) starting -with i = 0. For example: - - AES_CTR a("a 128 bit key!!!", 16); - char buf[500]; // some data - a.encrypt(buf, 100, 0); // encrypt first 100 bytes - a.encrypt(buf, 400, 100); // encrypt next 400 bytes - a.encrypt(buf, 500, 0); // decrypt in one step - -libzpaq::stretchKey(char* out, const char* in, const char* salt); - -Generate a 32 byte key out[0..31] from key[0..31] and salt[0..31] -using scrypt(key, salt, N=16384, r=8, p=1). key[0..31] should be -the SHA-256 hash of the password. With these parameters, the function -uses 0.1 to 0.3 seconds and 16 MiB memory. 
-Scrypt is defined in http://www.tarsnap.com/scrypt/scrypt.pdf - -void random(char* buf, int n); - -Puts n cryptographic random bytes in buf[0..n-1], where the first -byte is never '7' or 'z' (start of a ZPAQ archive). For a pure -random string, discard the first byte. - -Other classes and functions defined here are for internal use. -Use at your own risk. -*/ - -////////////////////////////////////////////////////////////// - -#ifndef LIBZPAQ_H -#define LIBZPAQ_H - -#ifndef DEBUG -#define NDEBUG 1 -#endif -#include -#include -#include -#include -#include - -namespace libzpaq { - -#include "../../support/type_definitions.h" -// 1, 2, 4, 8 byte unsigned integers -typedef BYTE U8; -//typedef uint16_t U16; -//typedef uint32_t U32; -//typedef uint64_t U64; - -// Tables for parsing ZPAQL source code -extern const char* compname[256]; // list of ZPAQL component types -extern const int compsize[256]; // number of bytes to encode a component -extern const char* opcodelist[272]; // list of ZPAQL instructions - -// Callback for error handling -extern void error(const char* msg); - -// Virtual base classes for input and output -// get() and put() must be overridden to read or write 1 byte. -// read() and write() may be overridden to read or write n bytes more -// efficiently than calling get() or put() n times. -class Reader { -public: - virtual int get() = 0; // should return 0..255, or -1 at EOF - virtual int read(char* buf, int n); // read to buf[n], return no. read - virtual ~Reader() {} -}; - -class Writer { -public: - virtual void put(int c) = 0; // should output low 8 bits of c - virtual void write(const char* buf, int n); // write buf[n] - virtual ~Writer() {} -}; - -// Read 16 bit little-endian number -int toU16(const char* p); - -// An Array of T is cleared and aligned on a 64 byte address -// with no constructors called. No copy or assignment. 
-// Array a(n, ex=0); - creates n< -class Array { - T *data; // user location of [0] on a 64 byte boundary - size_t n; // user size - int offset; // distance back in bytes to start of actual allocation - void operator=(const Array&); // no assignment - Array(const Array&); // no copy -public: - Array(size_t sz=0, int ex=0): data(0), n(0), offset(0) { - resize(sz, ex);} // [0..sz-1] = 0 - void resize(size_t sz, int ex=0); // change size, erase content to zeros - ~Array() {resize(0);} // free memory - size_t size() const {return n;} // get size - int isize() const {return int(n);} // get size as an int - T& operator[](size_t i) {assert(n>0 && i0 && (n&(n-1))==0); return data[i&(n-1)];} -}; - -// Change size to sz< -void Array::resize(size_t sz, int ex) { - assert(size_t(-1)>0); // unsigned type? - while (ex>0) { - if (sz>sz*2) error("Array too big"); - sz*=2, --ex; - } - if (n>0) { - assert(offset>0 && offset<=64); - assert((char*)data-offset); - ::free((char*)data-offset); - } - n=0; - offset=0; - if (sz==0) return; - n=sz; - const size_t nb=128+n*sizeof(T); // test for overflow - if (nb<=128 || (nb-128)/sizeof(T)!=n) n=0, error("Array too big"); - data=(T*)::calloc(nb, 1); - if (!data) n=0, error("Out of memory"); - offset=64-(((char*)data-(char*)0)&63); - assert(offset>0 && offset<=64); - data=(T*)((char*)data+offset); -} - -//////////////////////////// SHA1 //////////////////////////// - -// For computing SHA-1 checksums -class SHA1 { -public: - void put(int c) { // hash 1 byte - U32& r=w[U32(len)>>5&15]; - r=(r<<8)|(c&255); - len+=8; - if ((U32(len)&511)==0) process(); - } - void write(const char* buf, int64_t n); // hash buf[0..n-1] - double size() const {return len/8;} // size in bytes - uint64_t usize() const {return len/8;} // size in bytes - const char* result(); // get hash and reset - SHA1() {init();} -private: - void init(); // reset, but don't clear hbuf - U64 len; // length in bits - U32 h[5]; // hash state - U32 w[16]; // input buffer - char hbuf[20]; 
// result - void process(); // hash 1 block -}; - -//////////////////////////// SHA256 ////////////////////////// - -// For computing SHA-256 checksums -// http://en.wikipedia.org/wiki/SHA-2 -class SHA256 { -public: - void put(int c) { // hash 1 byte - unsigned& r=w[len0>>5&15]; - r=(r<<8)|(c&255); - if (!(len0+=8)) ++len1; - if ((len0&511)==0) process(); - } - double size() const {return len0/8+len1*536870912.0;} // size in bytes - uint64_t usize() const {return len0/8+((U64)len1<<29);} //size in bytes - const char* result(); // get hash and reset - SHA256() {init();} -private: - void init(); // reset, but don't clear hbuf - unsigned len0, len1; // length in bits (low, high) - unsigned s[8]; // hash state - unsigned w[16]; // input buffer - char hbuf[32]; // result - void process(); // hash 1 block -}; - -//////////////////////////// AES ///////////////////////////// - -// For encrypting with AES in CTR mode. -// The i'th 16 byte block is encrypted by XOR with AES(i) -// (i is big endian or MSB first, starting with 0). -class AES_CTR { - U32 Te0[256], Te1[256], Te2[256], Te3[256], Te4[256]; // encryption tables - U32 ek[60]; // round key - int Nr; // number of rounds (10, 12, 14 for AES 128, 192, 256) - U32 iv0, iv1; // first 8 bytes in CTR mode -public: - AES_CTR(const char* key, int keylen, const char* iv=0); - // Schedule: keylen is 16, 24, or 32, iv is 8 bytes or NULL - void encrypt(U32 s0, U32 s1, U32 s2, U32 s3, unsigned char* ct); - void encrypt(char* buf, int n, U64 offset); // encrypt n bytes of buf -}; - -//////////////////////////// stretchKey ////////////////////// - -// Strengthen password pw[0..pwlen-1] and salt[0..saltlen-1] -// to produce key buf[0..buflen-1]. Uses O(n*r*p) time and 128*r*n bytes -// of memory. n must be a power of 2 and r <= 8. -void scrypt(const char* pw, int pwlen, - const char* salt, int saltlen, - int n, int r, int p, char* buf, int buflen); - -// Generate a strong key out[0..31] key[0..31] and salt[0..31]. 
-// Calls scrypt(key, 32, salt, 32, 16384, 8, 1, out, 32); -void stretchKey(char* out, const char* key, const char* salt); - -//////////////////////////// random ////////////////////////// - -// Fill buf[0..n-1] with n cryptographic random bytes. The first -// byte is never '7' or 'z'. -void random(char* buf, int n); - -//////////////////////////// ZPAQL /////////////////////////// - -// Symbolic constants, instruction size, and names -typedef enum {NONE,CONS,CM,ICM,MATCH,AVG,MIX2,MIX,ISSE,SSE} CompType; -extern const int compsize[256]; -class Decoder; // forward - -// A ZPAQL machine COMP+HCOMP or PCOMP. -class ZPAQL { -public: - ZPAQL(); - ~ZPAQL(); - void clear(); // Free memory, erase program, reset machine state - void inith(); // Initialize as HCOMP to run - void initp(); // Initialize as PCOMP to run - double memory(); // Return memory requirement in bytes - void run(U32 input); // Execute with input - int read(Reader* in2); // Read header - bool write(Writer* out2, bool pp); // If pp write PCOMP else HCOMP header - int step(U32 input, int mode); // Trace execution (defined externally) - - Writer* output; // Destination for OUT instruction, or 0 to suppress - SHA1* sha1; // Points to checksum computer - U32 H(int i) {return h(i);} // get element of h - - void flush(); // write outbuf[0..bufptr-1] to output and sha1 - void outc(int ch) { // output byte ch (0..255) or -1 at EOS - if (ch<0 || (outbuf[bufptr]=ch, ++bufptr==outbuf.isize())) flush(); - } - - // ZPAQ1 block header - Array header; // hsize[2] hh hm ph pm n COMP (guard) HCOMP (guard) - int cend; // COMP in header[7...cend-1] - int hbegin, hend; // HCOMP/PCOMP in header[hbegin...hend-1] - -private: - // Machine state for executing HCOMP - Array m; // memory array M for HCOMP - Array h; // hash array H for HCOMP - Array r; // 256 element register array - Array outbuf; // output buffer - int bufptr; // number of bytes in outbuf - U32 a, b, c, d; // machine registers - int f; // condition flag - int pc; 
// program counter - int rcode_size; // length of rcode - U8* rcode; // JIT code for run() - - // Support code - int assemble(); // put JIT code in rcode - void init(int hbits, int mbits); // initialize H and M sizes - int execute(); // interpret 1 instruction, return 0 after HALT, else 1 - void run0(U32 input); // default run() if not JIT - void div(U32 x) {if (x) a/=x; else a=0;} - void mod(U32 x) {if (x) a%=x; else a=0;} - void swap(U32& x) {a^=x; x^=a; a^=x;} - void swap(U8& x) {a^=x; x^=a; a^=x;} - void err(); // exit with run time error -}; - -///////////////////////// Component ////////////////////////// - -// A Component is a context model, indirect context model, match model, -// fixed weight mixer, adaptive 2 input mixer without or with current -// partial byte as context, adaptive m input mixer (without or with), -// or SSE (without or with). - -struct Component { - size_t limit; // max count for cm - size_t cxt; // saved context - size_t a, b, c; // multi-purpose variables - Array cm; // cm[cxt] -> p in bits 31..10, n in 9..0; MATCH index - Array ht; // ICM/ISSE hash table[0..size1][0..15] and MATCH buf - Array a16; // MIX weights - void init(); // initialize to all 0 - Component() {init();} -}; - -////////////////////////// StateTable //////////////////////// - -// Next state table -class StateTable { -public: - U8 ns[1024]; // state*4 -> next state if 0, if 1, n0, n1 - int next(int state, int y) { // next state for bit y - assert(state>=0 && state<256); - assert(y>=0 && y<4); - return ns[state*4+y]; - } - int cminit(int state) { // initial probability of 1 * 2^23 - assert(state>=0 && state<256); - return ((ns[state*4+3]*2+1)<<22)/(ns[state*4+2]+ns[state*4+3]+1); - } - StateTable(); -}; - -///////////////////////// Predictor ////////////////////////// - -// A predictor guesses the next bit -class Predictor { -public: - Predictor(ZPAQL&); - ~Predictor(); - void init(); // build model - int predict(); // probability that next bit is a 1 (0..4095) - void 
update(int y); // train on bit y (0..1) - int stat(int); // Defined externally - bool isModeled() { // n>0 components? - assert(z.header.isize()>6); - return z.header[6]!=0; - } -private: - - // Predictor state - int c8; // last 0...7 bits. - int hmap4; // c8 split into nibbles - int p[256]; // predictions - U32 h[256]; // unrolled copy of z.h - ZPAQL& z; // VM to compute context hashes, includes H, n - Component comp[256]; // the model, includes P - bool initTables; // are tables initialized? - - // Modeling support functions - int predict0(); // default - void update0(int y); // default - int dt2k[256]; // division table for match: dt2k[i] = 2^12/i - int dt[1024]; // division table for cm: dt[i] = 2^16/(i+1.5) - U16 squasht[4096]; // squash() lookup table - short stretcht[32768];// stretch() lookup table - StateTable st; // next, cminit functions - U8* pcode; // JIT code for predict() and update() - int pcode_size; // length of pcode - - // reduce prediction error in cr.cm - void train(Component& cr, int y) { - assert(y==0 || y==1); - U32& pn=cr.cm(cr.cxt); - U32 count=pn&0x3ff; - int error=y*32767-(cr.cm(cr.cxt)>>17); - pn+=(error*dt[count]&-1024)+(count floor(32768/(1+exp(-x/64))) - int squash(int x) { - assert(initTables); - assert(x>=-2048 && x<=2047); - return squasht[x+2048]; - } - - // x -> round(64*log((x+0.5)/(32767.5-x))), approx inverse of squash - int stretch(int x) { - assert(initTables); - assert(x>=0 && x<=32767); - return stretcht[x]; - } - - // bound x to a 12 bit signed int - int clamp2k(int x) { - if (x<-2048) return -2048; - else if (x>2047) return 2047; - else return x; - } - - // bound x to a 20 bit signed int - int clamp512k(int x) { - if (x<-(1<<19)) return -(1<<19); - else if (x>=(1<<19)) return (1<<19)-1; - else return x; - } - - // Get cxt in ht, creating a new row if needed - size_t find(Array& ht, int sizebits, U32 cxt); - - // Put JIT code in pcode - int assemble_p(); -}; - -//////////////////////////// Decoder 
///////////////////////// - -// Decoder decompresses using an arithmetic code -class Decoder: public Reader { -public: - Reader* in; // destination - Decoder(ZPAQL& z); - int decompress(); // return a byte or EOF - int skip(); // skip to the end of the segment, return next byte - void init(); // initialize at start of block - int stat(int x) {return pr.stat(x);} - int get() { // return 1 byte of buffered input or EOF - if (rpos==wpos) { - rpos=0; - wpos=in ? in->read(&buf[0], BUFSIZE) : 0; - assert(wpos<=BUFSIZE); - } - return rpos buf; // input buffer of size BUFSIZE bytes - int decode(int p); // return decoded bit (0..1) with prob. p (0..65535) -}; - -/////////////////////////// PostProcessor //////////////////// - -class PostProcessor { - int state; // input parse state: 0=INIT, 1=PASS, 2..4=loading, 5=POST - int hsize; // header size - int ph, pm; // sizes of H and M in z -public: - ZPAQL z; // holds PCOMP - PostProcessor(): state(0), hsize(0), ph(0), pm(0) {} - void init(int h, int m); // ph, pm sizes of H and M - int write(int c); // Input a byte, return state - int getState() const {return state;} - void setOutput(Writer* out) {z.output=out;} - void setSHA1(SHA1* sha1ptr) {z.sha1=sha1ptr;} -}; - -//////////////////////// Decompresser //////////////////////// - -// For decompression and listing archive contents -class Decompresser { -public: - Decompresser(): z(), dec(z), pp(), state(BLOCK), decode_state(FIRSTSEG) {} - void setInput(Reader* in) {dec.in=in;} - bool findBlock(double* memptr = 0); - void hcomp(Writer* out2) {z.write(out2, false);} - bool findFilename(Writer* = 0); - void readComment(Writer* = 0); - void setOutput(Writer* out) {pp.setOutput(out);} - void setSHA1(SHA1* sha1ptr) {pp.setSHA1(sha1ptr);} - bool decompress(int n = -1); // n bytes, -1=all, return true until done - bool pcomp(Writer* out2) {return pp.z.write(out2, true);} - void readSegmentEnd(char* sha1string = 0); - int stat(int x) {return dec.stat(x);} - int buffered() {return 
dec.buffered();} -private: - ZPAQL z; - Decoder dec; - PostProcessor pp; - enum {BLOCK, FILENAME, COMMENT, DATA, SEGEND} state; // expected next - enum {FIRSTSEG, SEG, SKIP} decode_state; // which segment in block? -}; - -/////////////////////////// decompress() ///////////////////// - -void decompress(Reader* in, Writer* out); - -//////////////////////////// Encoder ///////////////////////// - -// Encoder compresses using an arithmetic code -class Encoder { -public: - Encoder(ZPAQL& z, int size=0): - out(0), low(1), high(0xFFFFFFFF), pr(z) {} - void init(); - void compress(int c); // c is 0..255 or EOF - int stat(int x) {return pr.stat(x);} - Writer* out; // destination -private: - U32 low, high; // range - Predictor pr; // to get p - Array buf; // unmodeled input - void encode(int y, int p); // encode bit y (0..1) with prob. p (0..65535) -}; - -//////////////////////////// Compiler //////////////////////// - -// Input ZPAQL source code with args and store the compiled code -// in hz and pz and write pcomp_cmd to out2. - -class Compiler { -public: - Compiler(const char* in, int* args, ZPAQL& hz, ZPAQL& pz, Writer* out2); -private: - const char* in; // ZPAQL source code - int* args; // Array of up to 9 args, default NULL = all 0 - ZPAQL& hz; // Output of COMP and HCOMP sections - ZPAQL& pz; // Output of PCOMP section - Writer* out2; // Output ... of "PCOMP ... ;" - int line; // Input line number for reporting errors - int state; // parse state: 0=space -1=word >0 (nest level) - - // Symbolic constants - typedef enum {NONE,CONS,CM,ICM,MATCH,AVG,MIX2,MIX,ISSE,SSE, - JT=39,JF=47,JMP=63,LJ=255, - POST=256,PCOMP,END,IF,IFNOT,ELSE,ENDIF,DO, - WHILE,UNTIL,FOREVER,IFL,IFNOTL,ELSEL,SEMICOLON} CompType; - - void syntaxError(const char* msg, const char* expected=0); // error() - void next(); // advance in to next token - bool matchToken(const char* tok);// in==token? 
- int rtoken(int low, int high); // return token which must be in range - int rtoken(const char* list[]); // return token by position in list - void rtoken(const char* s); // return token which must be s - int compile_comp(ZPAQL& z); // compile either HCOMP or PCOMP - - // Stack of n elements - class Stack { - libzpaq::Array s; - size_t top; - public: - Stack(int n): s(n), top(0) {} - void push(const U16& x) { - if (top>=s.size()) error("IF or DO nested too deep"); - s[top++]=x; - } - U16 pop() { - if (top<=0) error("unmatched IF or DO"); - return s[--top]; - } - }; - - Stack if_stack, do_stack; -}; - -//////////////////////// Compressor ////////////////////////// - -class Compressor { -public: - Compressor(): enc(z), in(0), state(INIT), verify(false) {} - void setOutput(Writer* out) {enc.out=out;} - void writeTag(); - void startBlock(int level); // level=1,2,3 - void startBlock(const char* hcomp); // ZPAQL byte code - void startBlock(const char* config, // ZPAQL source code - int* args, // NULL or int[9] arguments - Writer* pcomp_cmd = 0); // retrieve preprocessor command - void setVerify(bool v) {verify = v;} // check postprocessing? 
- void hcomp(Writer* out2) {z.write(out2, false);} - bool pcomp(Writer* out2) {return pz.write(out2, true);} - void startSegment(const char* filename = 0, const char* comment = 0); - void setInput(Reader* i) {in=i;} - void postProcess(const char* pcomp = 0, int len = 0); // byte code - bool compress(int n = -1); // n bytes, -1=all, return true until done - void endSegment(const char* sha1string = 0); - char* endSegmentChecksum(int64_t* size = 0, bool dosha1=true); - int64_t getSize() {return sha1.usize();} - const char* getChecksum() {return sha1.result();} - void endBlock(); - int stat(int x) {return enc.stat(x);} -private: - ZPAQL z, pz; // model and test postprocessor - Encoder enc; // arithmetic encoder containing predictor - Reader* in; // input source - SHA1 sha1; // to test pz output - char sha1result[20]; // sha1 output - enum {INIT, BLOCK1, SEG1, BLOCK2, SEG2} state; - bool verify; // if true then test by postprocessing -}; - -/////////////////////////// StringBuffer ///////////////////// - -// For (de)compressing to/from a string. Writing appends bytes -// which can be later read. -class StringBuffer: public libzpaq::Reader, public libzpaq::Writer { - unsigned char* p; // allocated memory, not NUL terminated, may be NULL - size_t al; // number of bytes allocated, 0 iff p is NULL - size_t wpos; // index of next byte to write, wpos <= al - size_t rpos; // index of next byte to read, rpos < wpos or return EOF. - size_t limit; // max size, default = -1 - const size_t init; // initial size on first use after reset - - // Increase capacity to a without changing size - void reserve(size_t a) { - assert(!al==!p); - if (a<=al) return; - unsigned char* q=0; - if (a>0) q=(unsigned char*)(p ? realloc(p, a) : malloc(a)); - if (a>0 && !q) error("Out of memory"); - p=q; - al=a; - } - - // Enlarge al to make room to write at least n bytes. 
- void lengthen(size_t n) { - assert(wpos<=al); - if (wpos+n>limit || wpos+n=a) a=a*2+init; - reserve(a); - } - - // No assignment or copy - void operator=(const StringBuffer&); - StringBuffer(const StringBuffer&); - -public: - - // Direct access to data - unsigned char* data() {assert(p || wpos==0); return p;} - - // Allocate no memory initially - StringBuffer(size_t n=0): - p(0), al(0), wpos(0), rpos(0), limit(size_t(-1)), init(n>128?n:128) {} - - // Set output limit - void setLimit(size_t n) {limit=n;} - - // Free memory - ~StringBuffer() {if (p) free(p);} - - // Return number of bytes written. - size_t size() const {return wpos;} - - // Return number of bytes left to read - size_t remaining() const {return wpos-rpos;} - - // Reset size to 0 and free memory. - void reset() { - if (p) free(p); - p=0; - al=rpos=wpos=0; - } - - // Write a single byte. - void put(int c) { // write 1 byte - lengthen(1); - assert(p); - assert(wposwpos) n=wpos-rpos; - if (n>0 && buf) memcpy(buf, p+rpos, n); - rpos+=n; - return n; - } - - // Return the entire string as a read-only array. - const char* c_str() const {return (const char*)p;} - - // Truncate the string to size i. - void resize(size_t i) { - wpos=i; - if (rpos>wpos) rpos=wpos; - } - - // Swap efficiently (init is not swapped) - void swap(StringBuffer& s) { - std::swap(p, s.p); - std::swap(al, s.al); - std::swap(wpos, s.wpos); - std::swap(rpos, s.rpos); - std::swap(limit, s.limit); - } -}; - -/////////////////////////// compress() /////////////////////// - -// Compress in to out in multiple blocks. Default method is "14,128,0" -// Default filename is "". Comment is appended to input size. -// dosha1 means save the SHA-1 checksum. -void compress(Reader* in, Writer* out, const char* method, - const char* filename=0, const char* comment=0, bool dosha1=true); - -// Same as compress() but output is 1 block, ignoring block size parameter. 
-void compressBlock(StringBuffer* in, Writer* out, const char* method, - const char* filename=0, const char* comment=0, bool dosha1=true); - -} // namespace libzpaq - -#endif // LIBZPAQ_H diff --git a/lib/algorithm/compression/uncompressed_codec.cpp b/lib/algorithm/compression/uncompressed_codec.cpp index addd1ea..c29c74f 100644 --- a/lib/algorithm/compression/uncompressed_codec.cpp +++ b/lib/algorithm/compression/uncompressed_codec.cpp @@ -3,7 +3,7 @@ namespace tachyon{ namespace algorithm{ -const bool UncompressedCodec::compress(container_type& container){ +bool UncompressedCodec::Compress(container_type& container){ container.buffer_data.resize(container.buffer_data_uncompressed.size() + 65536); memcpy(container.buffer_data.data(), container.buffer_data_uncompressed.data(), container.buffer_data_uncompressed.size()); container.header.data_header.controller.encoder = YON_ENCODE_NONE; @@ -12,7 +12,7 @@ const bool UncompressedCodec::compress(container_type& container){ return true; } -const bool UncompressedCodec::decompress(container_type& container){ +bool UncompressedCodec::Decompress(container_type& container){ if(container.header.data_header.controller.encryption != YON_ENCRYPTION_NONE){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Data is encrypted. Provide a valid keychain and decrypt before proceeding..." 
<< std::endl; return false; @@ -25,11 +25,11 @@ const bool UncompressedCodec::decompress(container_type& container){ container.buffer_data_uncompressed.resize(container.buffer_data.n_chars + 16536); memcpy(container.buffer_data_uncompressed.buffer, container.buffer_data.buffer, container.buffer_data.n_chars); container.buffer_data_uncompressed.n_chars = container.buffer_data.n_chars; - assert(container.checkCRC(0)); + assert(container.CheckMd5(0)); return true; } -const bool UncompressedCodec::decompressStrides(container_type& container){ +bool UncompressedCodec::DecompressStrides(container_type& container){ if(container.header.stride_header.controller.encryption != YON_ENCRYPTION_NONE){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Data is encrypted. Provide a valid keychain and decrypt before proceeding..." << std::endl; return false; @@ -48,7 +48,7 @@ const bool UncompressedCodec::decompressStrides(container_type& container){ container.buffer_strides_uncompressed.resize(container.buffer_strides.n_chars + 16536); memcpy(container.buffer_strides_uncompressed.buffer, container.buffer_strides.buffer, container.buffer_strides.n_chars); container.buffer_strides_uncompressed.n_chars = container.buffer_strides.n_chars; - assert(container.checkCRC(1)); + assert(container.CheckMd5(1)); return true; } diff --git a/lib/algorithm/compression/uncompressed_codec.h b/lib/algorithm/compression/uncompressed_codec.h index dfecb46..522819f 100644 --- a/lib/algorithm/compression/uncompressed_codec.h +++ b/lib/algorithm/compression/uncompressed_codec.h @@ -7,23 +7,21 @@ namespace tachyon{ namespace algorithm{ class UncompressedCodec : public CompressionContainer{ -private: - typedef UncompressedCodec self_type; - -protected: - typedef containers::DataContainer container_type; - typedef io::BasicBuffer buffer_type; - typedef algorithm::PermutationManager permutation_type; +public: + typedef UncompressedCodec self_type; + typedef containers::DataContainer container_type; + 
typedef io::BasicBuffer buffer_type; + typedef yon_gt_ppa permutation_type; public: UncompressedCodec() = default; ~UncompressedCodec() = default; - inline const bool compress(permutation_type& manager){ return true; } - const bool compress(container_type& container); - inline const bool compressStrides(container_type& container){ return true; } - const bool decompress(container_type& container); - const bool decompressStrides(container_type& container); + bool Compress(container_type& container, permutation_type& manager){ return true; } + bool Compress(container_type& container); + inline bool CompressStrides(container_type& container){ return true; } + bool Decompress(container_type& container); + bool DecompressStrides(container_type& container); protected: buffer_type buffer; diff --git a/lib/algorithm/compression/zpaq_codec.h b/lib/algorithm/compression/zpaq_codec.h deleted file mode 100644 index cbd932b..0000000 --- a/lib/algorithm/compression/zpaq_codec.h +++ /dev/null @@ -1,239 +0,0 @@ -#ifndef ALGORITHM_COMPRESSION_ZPAQ_CODEC_H_ -#define ALGORITHM_COMPRESSION_ZPAQ_CODEC_H_ - -#include "compression_container.h" -#include "zpaq_wrapper.h" - -namespace tachyon{ -namespace algorithm{ - -class ZPAQContainer : public CompressionContainer{ -private: - typedef ZPAQContainer self_type; - -public: - ZPAQContainer() : - compression_level_data(3), - compression_level_strides(3) - { - } - - virtual ~ZPAQContainer(){ } - - const bool compress(permutation_type& manager){ return false; } - - const bool compress(container_type& container, const std::string& command, const bool compress_strides = true){ - container.generateCRC(); - - if(container.header.data_header.controller.uniform || container.buffer_data_uncompressed.size() < 100){ - memcpy(container.buffer_data.data(), - container.buffer_data_uncompressed.data(), - container.buffer_data_uncompressed.size()); - container.header.data_header.controller.encoder = YON_ENCODE_NONE; - container.buffer_data.n_chars = 
container.buffer_data_uncompressed.size(); - container.header.data_header.cLength = container.buffer_data_uncompressed.size(); - - if(compress_strides){ - if(container.header.data_header.hasMixedStride()) - return(this->compressStrides(container, command)); - else return true; - } else return true; - } - - container.buffer_data.reset(); - container.buffer_data.resize(container.buffer_data_uncompressed.size() + 65536); - ZpaqWrapperIn in(container.buffer_data_uncompressed); - ZpaqWrapperOut out(container.buffer_data); - - libzpaq::compress(&in, &out, &command[0]); - - const float fold = (float)container.buffer_data_uncompressed.size() / out.buffer.size(); - if(fold < MIN_COMPRESSION_FOLD){ - memcpy(container.buffer_data.data(), - container.buffer_data_uncompressed.data(), - container.buffer_data_uncompressed.size()); - container.header.data_header.controller.encoder = YON_ENCODE_NONE; - container.buffer_data.n_chars = container.buffer_data_uncompressed.size(); - container.header.data_header.cLength = container.buffer_data_uncompressed.size(); - if(compress_strides){ - if(container.header.data_header.hasMixedStride()) - return(this->compressStrides(container, command)); - else return true; - } else return true; - } - - container.header.data_header.cLength = out.buffer.size(); - container.header.data_header.controller.encoder = YON_ENCODE_ZPAQ; - - if(compress_strides){ - if(container.header.data_header.hasMixedStride()) - return(this->compressStrides(container, command)); - else return true; - } else return true; - } - - const bool compress(container_type& container){ - return(this->compress(container, true)); - } - - const bool compress(container_type& container, const bool compress_strides){ - container.generateCRC(); - - if(container.header.data_header.controller.uniform || container.buffer_data_uncompressed.size() < 100){ - memcpy(container.buffer_data.data(), - container.buffer_data_uncompressed.data(), - container.buffer_data_uncompressed.size()); - 
container.header.data_header.controller.encoder = YON_ENCODE_NONE; - container.buffer_data.n_chars = container.buffer_data_uncompressed.size(); - container.header.data_header.cLength = container.buffer_data_uncompressed.size(); - - if(compress_strides){ - if(container.header.data_header.hasMixedStride()) - return(this->compressStrides(container)); - else return true; - } else return true; - //return true; - } - - container.buffer_data.reset(); - container.buffer_data.resize(container.buffer_data_uncompressed.size() + 65536); - ZpaqWrapperIn in(container.buffer_data_uncompressed); - ZpaqWrapperOut out(container.buffer_data); - libzpaq::compress(&in, &out, "x0.3ci1"); - - const float fold = (float)container.buffer_data_uncompressed.size() / out.buffer.size(); - if(fold < MIN_COMPRESSION_FOLD){ - memcpy(container.buffer_data.data(), - container.buffer_data_uncompressed.data(), - container.buffer_data_uncompressed.size()); - container.header.data_header.controller.encoder = YON_ENCODE_NONE; - container.buffer_data.n_chars = container.buffer_data_uncompressed.size(); - container.header.data_header.cLength = container.buffer_data_uncompressed.size(); - - if(compress_strides){ - if(container.header.data_header.hasMixedStride()) - return(this->compressStrides(container)); - else return true; - } else return true; - } - - container.header.data_header.cLength = out.buffer.size(); - container.header.data_header.controller.encoder = YON_ENCODE_ZPAQ; - - if(compress_strides){ - if(container.header.data_header.hasMixedStride()) - return(this->compressStrides(container)); - else return true; - } else return true; - } - - const bool compressStrides(container_type& container, const std::string& command){ - if(container.header.stride_header.controller.uniform || container.buffer_strides_uncompressed.size() < 100){ - memcpy(container.buffer_strides.data(), container.buffer_strides_uncompressed.data(), container.buffer_strides_uncompressed.size()); - 
container.header.stride_header.controller.encoder = YON_ENCODE_NONE; - container.buffer_strides.n_chars = container.buffer_strides_uncompressed.size(); - container.header.stride_header.cLength = container.buffer_strides_uncompressed.size(); - return true; - } - - container.buffer_strides.reset(); - ZpaqWrapperIn in(container.buffer_strides_uncompressed); - ZpaqWrapperOut out(container.buffer_strides); - out.buffer.resize(in.buffer.size() + 65536); - libzpaq::compress(&in, &out, &command[0]); - - const float fold = (float)container.buffer_strides_uncompressed.size()/out.buffer.size(); - if(fold < MIN_COMPRESSION_FOLD){ - memcpy(container.buffer_strides.data(), container.buffer_strides_uncompressed.data(), container.buffer_strides_uncompressed.size()); - container.header.stride_header.controller.encoder = YON_ENCODE_NONE; - container.buffer_strides.n_chars = container.buffer_strides_uncompressed.size(); - container.header.stride_header.cLength = container.buffer_strides_uncompressed.size(); - return true; - } - - //std::cerr << utility::timestamp("LOG","COMPRESSION-STRIDE") << "Input: " << container.buffer_strides_uncompressed.n_chars << " and output: " << ret << " -> " << (float)container.buffer_strides_uncompressed.n_chars/ret << "-fold" << std::endl; - - container.header.stride_header.cLength = out.buffer.size(); - container.header.stride_header.controller.encoder = YON_ENCODE_ZPAQ; - - return true; - } - - const bool compressStrides(container_type& container){ - if(container.header.stride_header.controller.uniform || container.buffer_strides_uncompressed.size() < 100){ - memcpy(container.buffer_strides.data(), container.buffer_strides_uncompressed.data(), container.buffer_strides_uncompressed.size()); - container.header.stride_header.controller.encoder = YON_ENCODE_NONE; - container.buffer_strides.n_chars = container.buffer_strides_uncompressed.size(); - container.header.stride_header.cLength = container.buffer_strides_uncompressed.size(); - return true; - } - - 
container.buffer_strides.reset(); - ZpaqWrapperIn in(container.buffer_strides_uncompressed); - ZpaqWrapperOut out(container.buffer_strides); - out.buffer.resize(in.buffer.size() + 65536); - libzpaq::compress(&in, &out, "x0.3ci1"); - - const float fold = (float)container.buffer_strides_uncompressed.size()/out.buffer.size(); - if(fold < MIN_COMPRESSION_FOLD){ - memcpy(container.buffer_strides.data(), container.buffer_strides_uncompressed.data(), container.buffer_strides_uncompressed.size()); - container.header.stride_header.controller.encoder = YON_ENCODE_NONE; - container.buffer_strides.n_chars = container.buffer_strides_uncompressed.size(); - container.header.stride_header.cLength = container.buffer_strides_uncompressed.size(); - return true; - } - - //std::cerr << utility::timestamp("LOG","COMPRESSION-STRIDE") << "Input: " << container.buffer_strides_uncompressed.n_chars << " and output: " << ret << " -> " << (float)container.buffer_strides_uncompressed.n_chars/ret << "-fold" << std::endl; - - container.header.stride_header.cLength = out.buffer.size(); - container.header.stride_header.controller.encoder = YON_ENCODE_ZPAQ; - - return true; - } - - const bool decompress(container_type& container){ - if(container.header.data_header.controller.encoder != YON_ENCODE_ZPAQ){ - return true; - } - - container.buffer_data_uncompressed.reset(); - container.buffer_data_uncompressed.resize(container.header.data_header.uLength + 65536); - ZpaqWrapperIn in(container.buffer_data); - ZpaqWrapperOut out(container.buffer_data_uncompressed); - libzpaq::decompress(&in, &out); - std::cerr << "zpaq decode: " << container.buffer_data.size() << "->" << container.buffer_data_uncompressed.size() << std::endl; - - assert(out.buffer.size() == container.header.data_header.uLength); - assert(container.checkCRC(0)); - - return true; - } - const bool decompressStrides(container_type& container){ - if(container.header.stride_header.controller.encoder != YON_ENCODE_ZPAQ){ - return true; - } - - 
container.buffer_strides_uncompressed.reset(); - container.buffer_strides_uncompressed.resize(container.header.stride_header.uLength + 65536); - ZpaqWrapperIn in(container.buffer_strides); - ZpaqWrapperOut out(container.buffer_strides_uncompressed); - libzpaq::decompress(&in, &out); - assert(out.buffer.size() == container.header.stride_header.uLength); - assert(container.checkCRC(1)); - - return true; - } - -protected: - int compression_level_data; - int compression_level_strides; - std::string compression_level_data_string; - std::string compression_level_strides_string; -}; - -} -} - - - -#endif /* ALGORITHM_COMPRESSION_ZPAQ_CODEC_H_ */ diff --git a/lib/algorithm/compression/zpaq_wrapper.h b/lib/algorithm/compression/zpaq_wrapper.h deleted file mode 100644 index b057174..0000000 --- a/lib/algorithm/compression/zpaq_wrapper.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef ALGORITHM_COMPRESSION_ZPAQ_WRAPPER_H_ -#define ALGORITHM_COMPRESSION_ZPAQ_WRAPPER_H_ - -#include "libzpaq.h" -#include "io/basic_buffer.h" - -namespace tachyon{ -namespace algorithm{ - -class ZpaqWrapperIn: public libzpaq::Reader { -private: - typedef io::BasicBuffer buffer_type; - -public: - ZpaqWrapperIn(buffer_type& buffer) : iterator_pos(0), buffer(buffer){} - ~ZpaqWrapperIn(){ } - inline int get(){ - if(this->iterator_pos + 1 == this->buffer.size()) return(-1); // eof - return((BYTE)this->buffer[this->iterator_pos++]); - getchar(); - } // returns byte 0..255 or -1 at EOF - - inline void reset(void){ - this->buffer.reset(); - this->iterator_pos = 0; - } - -public: - size_t iterator_pos; - buffer_type& buffer; - }; - - class ZpaqWrapperOut: public libzpaq::Writer { - private: - typedef io::BasicBuffer buffer_type; - - public: - ZpaqWrapperOut(buffer_type& buffer) : buffer(buffer){} - ~ZpaqWrapperOut(){ } - inline void put(int c){ this->buffer += (BYTE)c; } // writes 1 byte 0..255 - //inline void write(const char* buf, int n){ this->buffer.Add(buf, n); } - inline void reset(void){ 
this->buffer.reset(); } - - public: - buffer_type& buffer; -}; - -} -} - -#endif /* ALGORITHM_COMPRESSION_ZPAQ_WRAPPER_H_ */ diff --git a/lib/algorithm/compression/zstd_codec.cpp b/lib/algorithm/compression/zstd_codec.cpp index 41c8dfc..e7b68ca 100644 --- a/lib/algorithm/compression/zstd_codec.cpp +++ b/lib/algorithm/compression/zstd_codec.cpp @@ -1,5 +1,7 @@ #include "zstd_codec.h" +#include "algorithm/digest/variant_digest_manager.h" + namespace tachyon{ namespace algorithm{ @@ -17,9 +19,47 @@ ZSTDCodec::~ZSTDCodec(){ ZSTD_freeDCtx(this->decompression_context_); } -const bool ZSTDCodec::compress(container_type& container){ - container.generateCRC(); +bool ZSTDCodec::Compress(const io::BasicBuffer& src, io::BasicBuffer& dst, const int compression_level){ + dst.reset(); + dst.resize(src.size() + 65536); + const size_t ret = ZSTD_compress( + dst.data(), + dst.capacity(), + src.data(), + src.size(), + compression_level); + + //std::cerr << utility::timestamp("LOG","COMPRESSION") << "Input: " << src.size() << " and output: " << ret << " -> " << (float)src.size()/ret << "-fold" << std::endl; + + if(ZSTD_isError(ret)){ + std::cerr << utility::timestamp("ERROR","ZSTD") << ZSTD_getErrorString(ZSTD_getErrorCode(ret)) << std::endl; + return(false); + } + dst.n_chars = ret; + + return true; +} +bool ZSTDCodec::Decompress(const io::BasicBuffer& src, io::BasicBuffer& dst){ + const size_t ret = ZSTD_decompress( + dst.data(), + dst.capacity(), + src.data(), + src.size()); + + //std::cerr << utility::timestamp("LOG","COMPRESSION") << "Input: " << src.size() << " and output: " << ret << " -> " << (float)ret/src.size() << "-fold" << std::endl; + + if(ZSTD_isError(ret)){ + std::cerr << utility::timestamp("ERROR","ZSTD") << ZSTD_getErrorString(ZSTD_getErrorCode(ret)) << std::endl; + return(false); + } + + dst.n_chars = ret; + + return true; +} + +bool ZSTDCodec::Compress(container_type& container){ if(container.header.n_entries == 0){ container.header.data_header.controller.encoder = 
YON_ENCODE_NONE; container.buffer_data.n_chars = 0; @@ -34,8 +74,10 @@ const bool ZSTDCodec::compress(container_type& container){ container.header.data_header.cLength = container.buffer_data_uncompressed.size(); container.header.data_header.uLength = container.buffer_data_uncompressed.size(); + container.GenerateMd5(); + if(container.header.data_header.controller.mixedStride == true) - return(this->compressStrides(container)); + return(this->CompressStrides(container)); else return true; } @@ -64,8 +106,10 @@ const bool ZSTDCodec::compress(container_type& container){ container.header.data_header.cLength = container.buffer_data_uncompressed.size(); container.header.data_header.uLength = container.buffer_data_uncompressed.size(); + container.GenerateMd5(); + if(container.header.data_header.controller.mixedStride == true) - return(this->compressStrides(container)); + return(this->CompressStrides(container)); else return true; } @@ -76,12 +120,14 @@ const bool ZSTDCodec::compress(container_type& container){ container.header.data_header.cLength = container.buffer_data.size(); container.header.data_header.uLength = container.buffer_data_uncompressed.size(); + container.GenerateMd5(); + if(container.header.data_header.controller.mixedStride == true) - return(this->compressStrides(container)); + return(this->CompressStrides(container)); else return true; } -const bool ZSTDCodec::compressStrides(container_type& container){ +bool ZSTDCodec::CompressStrides(container_type& container){ if(container.header.stride_header.controller.uniform || container.buffer_strides_uncompressed.size() < 100){ memcpy(container.buffer_strides.data(), container.buffer_strides_uncompressed.data(), container.buffer_strides_uncompressed.size()); container.header.stride_header.controller.encoder = YON_ENCODE_NONE; @@ -128,26 +174,20 @@ const bool ZSTDCodec::compressStrides(container_type& container){ return true; } -const bool ZSTDCodec::compress(permutation_type& manager){ - if(manager.PPA.size() == 
0) +bool ZSTDCodec::Compress(container_type& container, permutation_type& manager){ + if(manager.n_samples == 0) return true; - U32 n_samples = manager.n_samples; - if(n_samples < 100){ - n_samples = 100; - manager.PPA.resize(n_samples*sizeof(U32)); - } + container.buffer_data_uncompressed.reset(); + container.buffer_data_uncompressed.resize(manager.n_samples*sizeof(U32) + 65536); + container.buffer_data_uncompressed << manager; this->buffer.reset(); - this->buffer.resize(n_samples*sizeof(U32) + 65536); - - U32 crc = crc32(0, NULL, 0); - manager.header.data_header.crc = crc32(crc, (Bytef*)manager.PPA.data(), manager.PPA.size()); - manager.header.data_header.uLength = manager.PPA.size(); + this->buffer.resize(manager.n_samples*sizeof(U32) + 65536); //const U32 in = manager.PPA.n_chars; - const int p_ret = permuteIntBits(manager.PPA.data(), - manager.PPA.size(), + const int p_ret = permuteIntBits(container.buffer_data_uncompressed.data(), + container.buffer_data_uncompressed.size(), this->buffer.data()); this->buffer.n_chars = p_ret; @@ -170,8 +210,10 @@ const bool ZSTDCodec::compress(permutation_type& manager){ */ - size_t ret = ZSTD_compress(manager.PPA.data(), - manager.PPA.capacity(), + container.buffer_data.reset(); + container.buffer_data.resize(p_ret + 65536); + size_t ret = ZSTD_compress(container.buffer_data.data(), + container.buffer_data.capacity(), this->buffer.data(), this->buffer.size(), this->compression_level_data); @@ -180,18 +222,22 @@ const bool ZSTDCodec::compress(permutation_type& manager){ if(ZSTD_isError(ret)){ std::cerr << "error zstd permute_ : " << ZSTD_getErrorCode(ret) << std::endl; std::cerr << ZSTD_getErrorName(ret) << std::endl; - std::cerr << this->buffer.n_chars << '\t' << manager.PPA.n_chars << std::endl; + std::cerr << this->buffer.n_chars << '\t' << container.buffer_data.size() << std::endl; exit(1); } + //std::cerr << utility::timestamp("LOG","COMPRESSION") << "PPA in: " << this->buffer.n_chars << " and out: " << ret << std::endl; 
- manager.PPA.n_chars = ret; - manager.header.data_header.uLength = this->buffer.n_chars; - manager.header.data_header.cLength = ret; + + container.buffer_data.n_chars = ret; + container.header.data_header.cLength = container.buffer_data.size(); + container.header.data_header.uLength = container.buffer_data_uncompressed.size(); + container.header.data_header.controller.encoder = YON_ENCODE_ZSTD; + container.GenerateMd5(); return true; } -const bool ZSTDCodec::decompress(container_type& container){ +bool ZSTDCodec::Decompress(container_type& container){ if(container.header.data_header.controller.encryption != YON_ENCRYPTION_NONE){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Data is encrypted. Provide a valid keychain and decrypt before proceeding..." << std::endl; return false; @@ -218,12 +264,12 @@ const bool ZSTDCodec::decompress(container_type& container){ assert(ret >= 0); container.buffer_data_uncompressed.n_chars = ret; assert((U32)ret == container.header.data_header.uLength); - assert(container.checkCRC(0)); + assert(container.CheckMd5(0)); return true; } -const bool ZSTDCodec::decompressStrides(container_type& container){ +bool ZSTDCodec::DecompressStrides(container_type& container){ if(container.header.stride_header.controller.encryption != YON_ENCRYPTION_NONE){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Data is encrypted. Provide a valid keychain and decrypt before proceeding..." << std::endl; return false; @@ -253,29 +299,26 @@ const bool ZSTDCodec::decompressStrides(container_type& container){ container.buffer_strides_uncompressed.n_chars = ret_stride; assert((U32)ret_stride == container.header.stride_header.uLength); //std::cerr << "ENCODE_ZSTD | STRIDE | CRC check " << (container.checkCRC(0) ? 
"PASS" : "FAIL") << std::endl; - assert(container.checkCRC(1)); + assert(container.CheckMd5(1)); return true; } -const bool ZSTDCodec::decompress(permutation_type& manager){ +bool ZSTDCodec::Decompress(container_type& container, permutation_type& manager){ this->buffer.reset(); - U32 n_samples = manager.n_samples; - if(n_samples < 100){ - n_samples = 100; - manager.PPA.resize(n_samples*sizeof(U32)); - } - this->buffer.resize(n_samples*sizeof(U32) + 65536); + container.buffer_data_uncompressed.reset(); + container.buffer_data_uncompressed.resize(manager.n_samples*sizeof(U32) + 65536); + this->buffer.resize(manager.n_samples*sizeof(U32) + 65536); size_t ret = ZSTD_decompress(this->buffer.data(), this->buffer.capacity(), - manager.PPA.data(), - manager.PPA.size()); + container.buffer_data.data(), + container.buffer_data.size()); if(ZSTD_isError(ret)){ std::cerr << "error zstd permute_ : " << ZSTD_getErrorCode(ret) << std::endl; std::cerr << ZSTD_getErrorName(ret) << std::endl; - std::cerr << this->buffer.n_chars << '\t' << manager.PPA.n_chars << std::endl; + std::cerr << this->buffer.n_chars << '\t' << container.buffer_data_uncompressed.n_chars << std::endl; exit(1); } @@ -283,14 +326,16 @@ const bool ZSTDCodec::decompress(permutation_type& manager){ //manager.PPA.n_chars = ret; //manager.c_length = ret; - manager.PPA.resize(ret + 16536); + //container.buffer_data_uncompressed.resize(ret + 16536); const int up_ret = unpermuteIntBits(this->buffer.data(), ret, - manager.PPA.data()); + container.buffer_data_uncompressed.data()); - //std::cerr << "ret: " << up_ret << std::endl; + //std::cerr << "ret: " << up_ret << std::endl; //memcpy(manager.PPA.buffer, this->buffer.data(), up_ret); - manager.PPA.n_chars = up_ret; + container.buffer_data_uncompressed.n_chars = up_ret; + container.buffer_data_uncompressed >> manager; + return true; } diff --git a/lib/algorithm/compression/zstd_codec.h b/lib/algorithm/compression/zstd_codec.h index ef160b3..55ff0bc 100644 --- 
a/lib/algorithm/compression/zstd_codec.h +++ b/lib/algorithm/compression/zstd_codec.h @@ -18,16 +18,19 @@ class ZSTDCodec : public CompressionContainer{ ZSTDCodec(); ~ZSTDCodec(); - inline void setCompressionLevel(const S32& c){ this->compression_level_data = c; this->compression_level_strides = c; } - inline void setCompressionLevelData(const S32& c){ this->compression_level_data = c; } - inline void setCompressionLevelStrides(const S32& c){ this->compression_level_strides = c; } - - const bool compress(container_type& container); - const bool compressStrides(container_type& container); - const bool compress(permutation_type& manager); - const bool decompress(container_type& container); - const bool decompressStrides(container_type& container); - const bool decompress(permutation_type& manager); + inline void SetCompressionLevel(const S32& c){ this->compression_level_data = c; this->compression_level_strides = c; } + inline void SetCompressionLevelData(const S32& c){ this->compression_level_data = c; } + inline void SetCompressionLevelStrides(const S32& c){ this->compression_level_strides = c; } + + bool Compress(container_type& container); + bool CompressStrides(container_type& container); + bool Compress(container_type& container, permutation_type& manager); + bool Decompress(container_type& container); + bool DecompressStrides(container_type& container); + bool Decompress(container_type& container, permutation_type& manager); + + bool Compress(const io::BasicBuffer& src, io::BasicBuffer& dst, const int compression_level); + bool Decompress(const io::BasicBuffer& src, io::BasicBuffer& dst); private: S32 compression_level_data; diff --git a/lib/algorithm/digest/digest.h b/lib/algorithm/digest/digest.h index dfa7672..0462b2b 100644 --- a/lib/algorithm/digest/digest.h +++ b/lib/algorithm/digest/digest.h @@ -64,11 +64,11 @@ struct DigitalDigest{ inline bool update(const buffer_type& data_buffer, const buffer_type& stride_buffer, const bool has_strides = true){ 
if(!this->hasInitialized) this->initialize(); - if(!SHA512_Update(&this->data_context, (BYTE*)data_buffer.data(), data_buffer.size())) + if(!SHA512_Update(&this->data_context, (const BYTE*)data_buffer.data(), data_buffer.size())) return false; if(has_strides){ - if(!SHA512_Update(&this->stride_context, (BYTE*)stride_buffer.data(), stride_buffer.size())) + if(!SHA512_Update(&this->stride_context, (const BYTE*)stride_buffer.data(), stride_buffer.size())) return false; } return true; diff --git a/lib/algorithm/digest/digest_manager.h b/lib/algorithm/digest/digest_manager.h index a1e0ea5..7b79454 100644 --- a/lib/algorithm/digest/digest_manager.h +++ b/lib/algorithm/digest/digest_manager.h @@ -94,7 +94,7 @@ class DigestManager{ inline const_reference back(void) const{ return(this->__entries[this->n_entries_ - 1]); } // Capacity - inline const bool empty(void) const{ return(this->n_entries_ == 0); } + inline bool empty(void) const{ return(this->n_entries_ == 0); } inline const size_type& size(void) const{ return(this->n_entries_); } inline const size_type& capacity(void) const{ return(this->n_capacity_); } diff --git a/lib/algorithm/digest/variant_digest_manager.cpp b/lib/algorithm/digest/variant_digest_manager.cpp index 87e444c..05d9132 100644 --- a/lib/algorithm/digest/variant_digest_manager.cpp +++ b/lib/algorithm/digest/variant_digest_manager.cpp @@ -54,35 +54,17 @@ VariantDigestManager::~VariantDigestManager(){ delete [] this->__entries_format; } -void VariantDigestManager::finalize(void){ +void VariantDigestManager::Finalize(void){ parent_type::finalize(); - for(U32 i = 0; i < this->n_capacity_info_; ++i) this->atINFO(i).finalize(); + for(U32 i = 0; i < this->n_capacity_info_; ++i) this->atINFO(i).finalize(); for(U32 i = 0; i < this->n_capacity_format ; ++i) this->atFORMAT(i).finalize(); } void VariantDigestManager::operator+=(const variant_block_type& block){ - this->at(1) += block.meta_contig_container; - this->at(2) += block.meta_positions_container; - 
this->at(3) += block.meta_names_container; - this->at(4) += block.meta_refalt_container; - this->at(5) += block.meta_controller_container; - this->at(6) += block.meta_quality_container; - this->at(7) += block.meta_names_container; - this->at(8) += block.meta_alleles_container; - this->at(9) += block.meta_info_map_ids; - this->at(10) += block.meta_format_map_ids; - this->at(11) += block.meta_filter_map_ids; - this->at(12) += block.gt_support_data_container; - this->at(13) += block.gt_rle8_container; - this->at(14) += block.gt_rle16_container; - this->at(15) += block.gt_rle32_container; - this->at(16) += block.gt_rle64_container; - this->at(17) += block.gt_simple8_container; - this->at(18) += block.gt_simple16_container; - this->at(19) += block.gt_simple32_container; - this->at(20) += block.gt_simple64_container; + for(U32 i = 1; i < YON_BLK_N_STATIC; ++i) + this->at(i) += block.base_containers[i]; - for(U32 i = 0; i < block.footer.n_info_streams; ++i) this->__entries_info[block.footer.info_offsets[i].data_header.global_key] += block.info_containers[i]; + for(U32 i = 0; i < block.footer.n_info_streams; ++i) this->__entries_info[block.footer.info_offsets[i].data_header.global_key] += block.info_containers[i]; for(U32 i = 0; i < block.footer.n_format_streams; ++i) this->__entries_format[block.footer.format_offsets[i].data_header.global_key] += block.format_containers[i]; } diff --git a/lib/algorithm/digest/variant_digest_manager.h b/lib/algorithm/digest/variant_digest_manager.h index 560b075..d134bca 100644 --- a/lib/algorithm/digest/variant_digest_manager.h +++ b/lib/algorithm/digest/variant_digest_manager.h @@ -1,6 +1,7 @@ #ifndef ALGORITHM_DIGEST_VARIANT_DIGEST_MANAGER_H_ #define ALGORITHM_DIGEST_VARIANT_DIGEST_MANAGER_H_ +#include "openssl/md5.h" #include "digest_manager.h" namespace tachyon{ @@ -19,7 +20,7 @@ class VariantDigestManager : public DigestManager{ VariantDigestManager(const self_type& other); ~VariantDigestManager(); - void finalize(void); + void 
Finalize(void); inline const_reference atINFO(const U32 position) const{ return(this->__entries_info[position]); } inline const_reference atFORMAT(const U32 position) const{ return(this->__entries_format[position]); } @@ -28,6 +29,23 @@ class VariantDigestManager : public DigestManager{ void operator+=(const variant_block_type& block); + template + static void GenerateMd5(const T& value, uint8_t* dst){ + // uint8_t hash[MD5_DIGEST_LENGTH]; + MD5_CTX md5; + MD5_Init(&md5); + MD5_Update(&md5, value, sizeof(T)); + MD5_Final(dst, &md5); + } + + static void GenerateMd5(const char* data, const uint32_t l_data, uint8_t* dst){ + // uint8_t hash[MD5_DIGEST_LENGTH]; + MD5_CTX md5; + MD5_Init(&md5); + MD5_Update(&md5, data, l_data); + MD5_Final(dst, &md5); + } + friend std::ostream& operator<<(std::ostream& out, const self_type& container){ const parent_type* const parent = reinterpret_cast(&container); out << *parent; diff --git a/lib/algorithm/encryption/encryption_decorator.cpp b/lib/algorithm/encryption/encryption_decorator.cpp index 81a9c0c..ef457f8 100644 --- a/lib/algorithm/encryption/encryption_decorator.cpp +++ b/lib/algorithm/encryption/encryption_decorator.cpp @@ -11,25 +11,13 @@ bool EncryptionDecorator::encrypt(variant_block_type& block, keychain_type& keyc } bool EncryptionDecorator::decryptAES256(variant_block_type& block, keychain_type& keychain){ - if(!this->decryptAES256(block.meta_contig_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.meta_positions_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.meta_refalt_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" 
<< std::endl; return false; } - if(!this->decryptAES256(block.meta_controller_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.meta_quality_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.meta_names_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.gt_rle8_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.gt_rle16_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.gt_rle32_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.gt_rle64_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.meta_alleles_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.gt_simple8_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.gt_simple16_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.gt_simple32_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" 
<< std::endl; return false; } - if(!this->decryptAES256(block.gt_simple64_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.gt_support_data_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.meta_info_map_ids, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.meta_filter_map_ids, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } - if(!this->decryptAES256(block.meta_format_map_ids, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } + + for(U32 i = 1; i < YON_BLK_N_STATIC; ++i){ + if(!this->decryptAES256(block.base_containers[i], keychain)){ + std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; + return false; + } + } for(U32 i = 0; i < block.footer.n_info_streams; ++i){ if(!this->decryptAES256(block.info_containers[i], keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to decrypt!" << std::endl; return false; } @@ -45,28 +33,14 @@ bool EncryptionDecorator::decryptAES256(variant_block_type& block, keychain_type bool EncryptionDecorator::encryptAES256(variant_block_type& block, keychain_type& keychain){ BYTE RANDOM_BYTES[32]; RAND_bytes(&RANDOM_BYTES[0], 32); - block.header.blockID = XXH64(&RANDOM_BYTES[0], 32, 1337); - - if(!this->encryptAES256(block.meta_contig_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.meta_positions_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" 
<< std::endl; return false; } - if(!this->encryptAES256(block.meta_refalt_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.meta_controller_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.meta_quality_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.meta_names_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.gt_rle8_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.gt_rle16_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.gt_rle32_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.gt_rle64_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.meta_alleles_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.gt_simple8_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.gt_simple16_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" 
<< std::endl; return false; } - if(!this->encryptAES256(block.gt_simple32_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.gt_simple64_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.gt_support_data_container, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.meta_info_map_ids, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.meta_filter_map_ids, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } - if(!this->encryptAES256(block.meta_format_map_ids, keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; return false; } + block.header.block_hash = XXH64(&RANDOM_BYTES[0], 32, 1337); + for(U32 i = 1; i < YON_BLK_N_STATIC; ++i){ + if(!this->encryptAES256(block.base_containers[i], keychain)){ + std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" << std::endl; + return false; + } + } for(U32 i = 0; i < block.footer.n_info_streams; ++i){ if(!this->encryptAES256(block.info_containers[i], keychain)){ std::cerr << utility::timestamp("ERROR","ENCRYPTION") << "Failed to encrypt!" 
<< std::endl; return false; } } diff --git a/lib/algorithm/encryption/keychain.h b/lib/algorithm/encryption/keychain.h index 74d9443..61f7011 100644 --- a/lib/algorithm/encryption/keychain.h +++ b/lib/algorithm/encryption/keychain.h @@ -75,7 +75,7 @@ class Keychain{ inline const_reference back(void) const{ return(this->entries_[this->n_entries_ - 1]); } // Capacity - inline const bool empty(void) const{ return(this->n_entries_ == 0); } + inline bool empty(void) const{ return(this->n_entries_ == 0); } inline const size_type& size(void) const{ return(this->n_entries_); } inline const size_type& capacity(void) const{ return(this->n_capacity_); } @@ -159,6 +159,9 @@ class Keychain{ template Keychain::Keychain() : + version_major_(0), + version_minor_(0), + version_release_(0), n_entries_(0), n_capacity_(100000), entries_(new value_type[this->n_capacity_]), @@ -168,6 +171,9 @@ Keychain::Keychain() : template Keychain::Keychain(const U32 start_capacity) : + version_major_(0), + version_minor_(0), + version_release_(0), n_entries_(0), n_capacity_(start_capacity), entries_(new value_type[this->n_capacity_]), @@ -176,6 +182,9 @@ Keychain::Keychain(const U32 start_capacity) : template Keychain::Keychain(const self_type& other) : + version_major_(other.version_major_), + version_minor_(other.version_minor_), + version_release_(other.version_release_), n_entries_(other.n_entries_), n_capacity_(other.n_capacity_), entries_(new value_type[this->n_capacity_]), diff --git a/lib/algorithm/encryption/keychain_key.h b/lib/algorithm/encryption/keychain_key.h index 0f52a39..3118dc3 100644 --- a/lib/algorithm/encryption/keychain_key.h +++ b/lib/algorithm/encryption/keychain_key.h @@ -96,9 +96,9 @@ struct KeychainKeyGCM : public KeychainKey{ friend std::ostream& operator<<(std::ostream& stream, const self_type& key){ stream.write(reinterpret_cast(&key.field_id), sizeof(U64)); stream.write(reinterpret_cast(&key.encryption_type), sizeof(BYTE)); - stream.write((char*)&key.key[0], 
KeyLength); - stream.write((char*)&key.iv[0], IVLength); - stream.write((char*)&key.tag[0], TagLength); + stream.write((const char*)&key.key[0], KeyLength); + stream.write((const char*)&key.iv[0], IVLength); + stream.write((const char*)&key.tag[0], TagLength); return(stream); } diff --git a/lib/algorithm/permutation/genotype_sorter.cpp b/lib/algorithm/permutation/genotype_sorter.cpp new file mode 100644 index 0000000..162de31 --- /dev/null +++ b/lib/algorithm/permutation/genotype_sorter.cpp @@ -0,0 +1,211 @@ +#include "genotype_sorter.h" +#include "third_party/xxhash/xxhash.h" + +namespace tachyon { +namespace algorithm { + +GenotypeSorter::GenotypeSorter() : + n_samples(0), + gt_pattern(nullptr) +{ + memset(this->gt_remap, 0, 256*sizeof(uint8_t)); +} + +GenotypeSorter::~GenotypeSorter(){ + delete [] this->gt_pattern; +} + +void GenotypeSorter::SetSamples(const U64 n_samples){ + this->n_samples = n_samples; + this->permutation_array.Allocate(n_samples); + this->gt_pattern = new yon_radix_gt[n_samples]; +} + +void GenotypeSorter::reset(void){ + this->permutation_array.reset(); +} + +bool GenotypeSorter::Build(const vcf_container_type& vcf_container, io::VcfHeader& vcf_header){ + if(this->GetNumberSamples() == 0) + return true; + + // Reset the permutation array to [0, n_samples). + this->permutation_array.reset(); + + // Allocate tetraploid worth of memory in the first instance. + int32_t largest_ploidy = 0; + uint32_t largest_n_alleles = 0; + uint32_t n_valid_records = 0; + for(U32 i = 0; i < vcf_container.sizeWithoutCarryOver(); ++i){ + if(vcf_container[i]->n_fmt == 0) continue; + + // Perform these actions if FORMAT:GT data is available. 
+ const int& hts_format_key = vcf_container.at(i)->d.fmt[0].id; // htslib IDX value + if(vcf_header.GetFormat(hts_format_key)->id != "GT"){ + continue; + } + + ++n_valid_records; + largest_ploidy = std::max(vcf_container[i]->d.fmt[0].n, largest_ploidy); + largest_n_alleles = std::max((uint32_t)vcf_container[i]->n_allele + 2, largest_n_alleles); + } + + // If there are no valid FORMAT:GT entries in the list then + // return false. + if(n_valid_records == 0) + return false; + + // Shift largest_n_alleles one bit to the left as this is how + // alleles are stored in Vcf as the first bit encodes for + // the phasing. The phasing may be unphased (0) or phased (1). + // Add a count of one to the largest alleles as the count should + // be fully inclusive [0, n_alleles]. + // Add a count of one to the shifted value to represent the phased + // case as described above. + const uint32_t largest_n_alleles_binary = (((largest_n_alleles + 1) << 1) + 1); + const uint8_t shift_size = ceil(log2(largest_n_alleles_binary)); + assert(shift_size * largest_ploidy <= 64); + + // Ascertain that enough memory has been allocated. + if(largest_ploidy >= this->gt_pattern[0].n_allocated){ + for(U32 i = 0; i < this->GetNumberSamples(); ++i) + this->gt_pattern[i].resize(largest_ploidy + 3); + } + + // Map a genotype such that missing and sentinel node symbol (EOV) + // is stored in the back of the order. + for(U32 i = 1; i <= largest_n_alleles; ++i) this->gt_remap[i] = i; + this->gt_remap[0] = largest_n_alleles - 1; // Missing value. + this->gt_remap[64] = largest_n_alleles; // Sentinel node symbol in unsigned space (129 >> 1 = 64). + + // In order to keep track of bins that are non-empty we use a + // vector of pointers to the bins and a map from bins to the vector + // offsets. The map uses the hash of the alleles as the key. + std::vector< std::vector > bin_used; + std::unordered_map bin_used_map; + // The byte-packed integer is used to determine the relative sort + // order of the bins. 
+ std::vector bin_used_packed_integer; + // At the end of constructing the vectors of potential genotypic + // bins there is no guarantee at all that they are in order. Thus + // we sort a tuple (bit-packed integer, and incremental order) and + // sort on the bit-packed integer and merge bins with help of the + // original array ordering. + std::vector< std::pair > sort_helper; + + // Recycle iterator objects because the constructors appears to be + // relatively expensive. + std::unordered_map::const_iterator it; + std::unordered_map::const_iterator end; + + // Iterate over all available bcf1_t records in the container. + for(U32 i = 0; i < vcf_container.sizeWithoutCarryOver(); ++i){ + if(vcf_container[i]->n_fmt == 0) continue; + + // Perform these actions if FORMAT:GT data is available. + const int& hts_format_key = vcf_container.at(i)->d.fmt[0].id; // htslib IDX value + if(vcf_header.GetFormat(hts_format_key)->id != "GT"){ + continue; + } + + // Setup. + const bcf1_t* bcf = vcf_container[i]; + const uint8_t* gt = bcf->d.fmt[0].p; + const uint32_t base_ploidy = bcf->d.fmt[0].n; + assert(bcf->d.fmt[0].p_len == sizeof(int8_t) * base_ploidy * this->GetNumberSamples()); + + // Keep track of buffer position. + U32 gt_offset = 0; + + // Iterate over all available samples. + for(U32 s = 0; s < this->GetNumberSamples(); ++s){ + this->gt_pattern[s].n_ploidy = base_ploidy; + this->gt_pattern[s].id = s; + assert(base_ploidy < gt_pattern[s].n_allocated); + // Iterate over the ploidy for this sample and update + // the allele for that chromosome in the pattern helper + // structure. 
+ for(U32 a = 0; a < base_ploidy; ++a, ++gt_offset){ + const uint8_t repacked = (this->gt_remap[gt[gt_offset] >> 1] << 1) | (gt[gt_offset] & 1); + assert((repacked >> 1) <= largest_n_alleles); + assert(repacked < largest_n_alleles_binary); + this->gt_pattern[s].alleles[a] = repacked; + } + } + assert(gt_offset == bcf->d.fmt[0].p_len); + + // Iterate over all encoded genotypes and assign them + // to different bins according to their bitpacked values. + for(U32 s = 0; s < this->GetNumberSamples(); ++s){ + // Hash the pattern of alleles + const U64 hash_pattern = XXH64(this->gt_pattern[this->permutation_array[s]].alleles, + sizeof(uint16_t) * this->gt_pattern[this->permutation_array[s]].n_ploidy, + 651232); + // Update const_iterators for the hash mapper. + it = bin_used_map.find(hash_pattern); + end = bin_used_map.cend(); + if(it == end){ // Case: does not exist in map. + bin_used_map[hash_pattern] = bin_used.size(); + bin_used.push_back(std::vector()); + bin_used.back().push_back(&this->gt_pattern[this->permutation_array[s]]); + bin_used_packed_integer.push_back(this->gt_pattern[this->permutation_array[s]].GetPackedInteger(shift_size)); + } else { // Case: exist in map. + bin_used[it->second].push_back(&this->gt_pattern[this->permutation_array[s]]); + } + } + + // Sort by bin value. 
+ for(U32 k = 0; k < bin_used.size(); ++k) + sort_helper.push_back(std::pair(bin_used_packed_integer[k], k)); + + std::sort(sort_helper.begin(), sort_helper.end()); + + U32 n_sample_c = 0; + for(U32 s = 0; s < sort_helper.size(); ++s){ + for(U32 k = 0; k < bin_used[sort_helper[s].second].size(); ++k, ++n_sample_c){ + permutation_array[n_sample_c] = bin_used[sort_helper[s].second][k]->id; + } + } + assert(n_sample_c == this->GetNumberSamples()); + + bin_used.clear(); + bin_used_map.clear(); + bin_used_packed_integer.clear(); + sort_helper.clear(); + } + + //this->Debug(std::cout, vcf_container, permutation_array); + + return true; +} + +void GenotypeSorter::Debug(std::ostream& stream, const vcf_container_type& vcf_container, const yon_gt_ppa& ppa){ + for(U32 i = 0; i < vcf_container.sizeWithoutCarryOver(); ++i){ + const bcf1_t* bcf = vcf_container[i]; + const uint8_t* gt = bcf->d.fmt[0].p; + const uint32_t base_ploidy = bcf->d.fmt[0].n; + assert(bcf->d.fmt[0].p_len == sizeof(int8_t) * base_ploidy * this->GetNumberSamples()); + + // Keep track of buffer position. + U32 gt_offset = 0; + + stream << bcf->pos + 1 << "\t"; + // Iterate over all available samples. + for(U32 s = 0; s < this->GetNumberSamples(); ++s){ + const uint8_t* gt_target = >[ppa[s] * sizeof(int8_t) * base_ploidy]; + + stream << (U32)(gt_target[0] >> 1); + // Iterate over the ploidy for this sample and update + // the allele for that chromosome in the pattern helper + // structure. 
+ for(U32 a = 1; a < base_ploidy; ++a){ + stream << "|" << (U32)(gt_target[a] >> 1); + } + stream << "\t"; + } + stream << std::endl; + } +} + +} /* namespace IO */ +} /* namespace Tachyon */ diff --git a/lib/algorithm/permutation/genotype_sorter.h b/lib/algorithm/permutation/genotype_sorter.h new file mode 100644 index 0000000..1753e36 --- /dev/null +++ b/lib/algorithm/permutation/genotype_sorter.h @@ -0,0 +1,46 @@ +#ifndef ALGORITHM_COMPRESSION_RADIXSORTGT_H_ +#define ALGORITHM_COMPRESSION_RADIXSORTGT_H_ + +#include "core/genotypes.h" +#include "containers/vcf_container.h" + +namespace tachyon { +namespace algorithm { + +/* + * This class performs a radix sort on a + * block of variant lines given they are + * bi-allelic diploid. + */ +class GenotypeSorter { +public: + typedef GenotypeSorter self_type; + typedef containers::VcfContainer vcf_container_type; + +public: + GenotypeSorter(); + GenotypeSorter(const U64 n_samples); + ~GenotypeSorter(); + + // Reset does NOT need to cast after each + // iteration as values are overwritten + // each cycle + void reset(void); + void SetSamples(const U64 n_samples); + + bool Build(const vcf_container_type& vcf_container, io::VcfHeader& vcf_header); + void Debug(std::ostream& stream, const vcf_container_type& vcf_container, const yon_gt_ppa& ppa); + + inline const U64& GetNumberSamples(void) const{ return(this->n_samples); } + +public: + U64 n_samples; // total number of entries in file + yon_gt_ppa permutation_array; + yon_radix_gt* gt_pattern; + uint8_t gt_remap[256]; +}; + +} +} + +#endif /* ALGORITHM_COMPRESSION_RADIXSORTGT_H_ */ diff --git a/lib/algorithm/permutation/permutation_manager.cpp b/lib/algorithm/permutation/permutation_manager.cpp deleted file mode 100644 index 92787f6..0000000 --- a/lib/algorithm/permutation/permutation_manager.cpp +++ /dev/null @@ -1,43 +0,0 @@ -#include "permutation_manager.h" - -namespace tachyon{ -namespace algorithm{ - -PermutationManager::PermutationManager() : - n_samples(0) -{} - 
-PermutationManager::PermutationManager(const U32 n_samples) : - n_samples(n_samples), - PPA(sizeof(U32)*n_samples) -{} - -PermutationManager::~PermutationManager(){ } - -void PermutationManager::setSamples(const U32 n_samples){ - this->n_samples = n_samples; - this->PPA.reset(); - this->PPA.resize(sizeof(S32)*n_samples); - - for(U32 i = 0; i < this->n_samples; ++i) - this->PPA += (U32)i; -} - -void PermutationManager::reset(void){ - for(U32 i = 0; i < this->n_samples; ++i) - (*this)[i] = i; - - this->header.reset(); - this->PPA.n_chars = this->n_samples*sizeof(U32); -} - -bool PermutationManager::generateCRC(void){ - // Checksum for main buffer - U32 crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)this->PPA.buffer, this->n_samples*sizeof(U32)); - this->header.data_header.crc = crc; - return true; -} - -} -} diff --git a/lib/algorithm/permutation/permutation_manager.h b/lib/algorithm/permutation/permutation_manager.h deleted file mode 100644 index d04b1d8..0000000 --- a/lib/algorithm/permutation/permutation_manager.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef ALGORITHM_PERMUTATIONMANAGER_H_ -#define ALGORITHM_PERMUTATIONMANAGER_H_ - -#include - -#include "containers/components/data_container_header.h" -#include "third_party/zlib/zconf.h" -#include "third_party/zlib/zlib.h" -#include "io/basic_buffer.h" - - -namespace tachyon{ -namespace algorithm{ - -// Manages the PPA array and the Occ function -// of the PPA array -class PermutationManager{ - typedef PermutationManager self_type; - typedef io::BasicBuffer buffer_type; - typedef containers::DataContainerHeader header_type; - -public: - PermutationManager(); - PermutationManager(const U32 n_samples); - ~PermutationManager(); - - void setSamples(const U32 n_samples); - void reset(void); - bool generateCRC(void); - inline const U32 getObjectSize(void) const{ return(sizeof(U64) + this->header.data_header.cLength); } - - // Lookup - // Convenience function used during import - inline U32* get(void){ 
return(reinterpret_cast(this->PPA.buffer)); } - inline const U32* get(void) const{ return(reinterpret_cast(this->PPA.buffer)); } - inline U32& operator[](const U32& p){ return(*reinterpret_cast(&this->PPA.buffer[p * sizeof(U32)])); } - inline const U32& operator[](const U32& p) const{ return(*reinterpret_cast(&this->PPA.buffer[p * sizeof(U32)])); } - - -private: - friend std::ostream& operator<<(std::ostream& stream, const self_type& manager){ - stream.write(reinterpret_cast(&manager.n_samples),sizeof(U64)); - stream.write(manager.PPA.data(), manager.header.data_header.cLength); - return(stream); - } - - friend std::ifstream& operator>>(std::ifstream& stream, self_type& manager){ - stream.read(reinterpret_cast(&manager.n_samples),sizeof(U64)); - manager.PPA.resize(manager.header.data_header.uLength); - stream.read(manager.PPA.data(), manager.header.data_header.cLength); - manager.PPA.n_chars = manager.header.data_header.cLength; - return(stream); - } - -public: - U64 n_samples; // redundancy but convenient - header_type header; - buffer_type PPA; -}; - -} -} - -#endif /* ALGORITHM_PERMUTATIONMANAGER_H_ */ diff --git a/lib/algorithm/permutation/radix_sort_gt.cpp b/lib/algorithm/permutation/radix_sort_gt.cpp deleted file mode 100644 index e4868b2..0000000 --- a/lib/algorithm/permutation/radix_sort_gt.cpp +++ /dev/null @@ -1,162 +0,0 @@ -#include "radix_sort_gt.h" - -#include - -namespace tachyon { -namespace algorithm { - -RadixSortGT::RadixSortGT() : - n_samples(0), - position(0), - GT_array(nullptr), - bins(new U32*[9]), - manager(nullptr) -{ - for(U32 i = 0; i < 9; ++i) bins[i] = nullptr; - memset(&p_i, 0, sizeof(U32)*9); -} - -RadixSortGT::~RadixSortGT(){ - delete [] this->GT_array; - for(U32 i = 0; i < 9; ++i) delete [] this->bins[i]; - delete [] this->bins; -} - -void RadixSortGT::setSamples(const U64 n_samples){ - this->n_samples = n_samples; - - // Delete previous - delete [] this->GT_array; - - // Set new - this->GT_array = new BYTE[this->n_samples]; - - // 
Reset - for(U32 i = 0; i < 9; ++i){ - this->bins[i] = new U32[n_samples]; - memset(this->bins[i], 0, sizeof(U32)*n_samples); - } - memset(this->GT_array, 0, sizeof(BYTE)*n_samples); - - this->manager->setSamples(n_samples); - this->manager->setSamples(n_samples*2); -} - -void RadixSortGT::reset(void){ - this->position = 0; - memset(this->GT_array, 0, sizeof(BYTE)*n_samples); - memset(&p_i, 0, sizeof(U32)*9); - this->manager->reset(); -} - -bool RadixSortGT::build(const bcf_reader_type& reader){ - if(reader.size() == 0) - return false; - - // Cycle over BCF entries - for(U32 i = 0; i < reader.size(); ++i){ - if(!this->update(reader[i])) - continue; - } - return(true); -} - -bool RadixSortGT::update(const bcf_entry_type& entry){ - // Check again because we might use it - // iteratively at some point in time - // i.e. not operating through the - // build() function - // Have to have genotypes available - if(entry.hasGenotypes == false) - return false; - - if(entry.gt_support.hasEOV || entry.gt_support.ploidy != 2) - return false; - - // Has to be biallelic - // otherwise skip - if(!entry.isBiallelic()) - return false; - - // Cycle over genotypes at this position - // Ignore phasing at this stage - // - // Genotype encodings are thus: - // 0/0 -> 0000b = 0 -> 0 - // 0/1 -> 0001b = 1 -> 3 - // 0/. -> 0010b = 2 -> 4 - // 1/0 -> 0100b = 4 -> 2 - // 1/1 -> 0101b = 5 -> 1 - // 1/. -> 0110b = 6 -> 5 - // ./0 -> 1000b = 8 -> 6 - // ./1 -> 1001b = 9 -> 7 - // ./. 
-> 1010b = 10 -> 8 - // - // Update GT_array - if(entry.formatID[0].primitive_type != 1){ - std::cerr << utility::timestamp("ERROR","PERMUTE") << "Illegal primitive: " << (int)entry.formatID[0].primitive_type << std::endl; - exit(1); - } - - U32 internal_pos = entry.formatID[0].l_offset; - U32 k = 0; - for(U32 i = 0; i < 2*this->n_samples; i += 2, ++k){ - const SBYTE& fmt_type_value1 = *reinterpret_cast(&entry.data[internal_pos++]); - const SBYTE& fmt_type_value2 = *reinterpret_cast(&entry.data[internal_pos++]); - const BYTE packed = (bcf::BCF_UNPACK_GENOTYPE(fmt_type_value2) << 2) | bcf::BCF_UNPACK_GENOTYPE(fmt_type_value1); - this->GT_array[k] = packed; - } - - // Build PPA - // 3^2 = 9 state radix sort over - // states: alleles \in {00, 01, 10} - // b entries in a YON block B - // This is equivalent to a radix sort - // on the alphabet {0,1,...,8} - U32 target_ID = 0; - for(U32 j = 0; j < this->n_samples; ++j){ - // Determine correct bin - switch(this->GT_array[(*this->manager)[j]]){ - case 0: target_ID = 0; break; // 0000: Ref, ref - case 1: target_ID = 3; break; // 0001: Ref, alt - case 2: target_ID = 4; break; // 0010: Ref, Missing - case 4: target_ID = 2; break; // 0100: Alt, ref - case 5: target_ID = 1; break; // 0101: Alt, alt - case 6: target_ID = 5; break; // 0110: Alt, missing - case 8: target_ID = 6; break; // 1000: Missing, ref - case 9: target_ID = 7; break; // 1001: Missing, alt - case 10: target_ID = 8; break; // 1010: Missing, missing - default: - std::cerr << utility::timestamp("ERROR","PERMUTE") << "Illegal state in radix sort..." 
<< std::endl; - exit(1); - } - - // Update bin i at position i with ppa[j] - this->bins[target_ID][this->p_i[target_ID]] = (*this->manager)[j]; - ++this->p_i[target_ID]; - } // end loop over individuals at position i - - // Update PPA data - // Copy data in sorted order - U32 cum_pos = 0; - for(U32 i = 0; i < 9; ++i){ - // Copy data in bin i to current position - memcpy(&this->manager->PPA[cum_pos*sizeof(U32)], this->bins[i], this->p_i[i]*sizeof(U32)); - - // Update cumulative position and reset - cum_pos += this->p_i[i]; - this->p_i[i] = 0; - } - // Make sure the cumulative position - // equals the number of samples in the - // dataset - assert(cum_pos == this->n_samples); - - // Keep track of how many entries we've iterated over - ++this->position; - - return true; -} - -} /* namespace IO */ -} /* namespace Tachyon */ diff --git a/lib/algorithm/permutation/radix_sort_gt.h b/lib/algorithm/permutation/radix_sort_gt.h deleted file mode 100644 index 0a14a90..0000000 --- a/lib/algorithm/permutation/radix_sort_gt.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef ALGORITHM_COMPRESSION_RADIXSORTGT_H_ -#define ALGORITHM_COMPRESSION_RADIXSORTGT_H_ - -#include "io/bcf/BCFReader.h" -#include "permutation_manager.h" -#include "core/genotype_summary.h" - -namespace tachyon { -namespace algorithm { - -/* - * This class performs a radix sort on a - * block of variant lines given they are - * bi-allelic diploid. 
- */ -class RadixSortGT { - typedef RadixSortGT self_type; - typedef bcf::BCFReader bcf_reader_type; - typedef bcf::BCFEntry bcf_entry_type; - typedef PermutationManager manager_type; - -public: - RadixSortGT(); - RadixSortGT(const U64 n_samples); - ~RadixSortGT(); - - // Reset does NOT need to cast after each - // iteration as values are overwritten - // each cycle - void reset(void); - void setSamples(const U64 n_samples); - - // Construct given a reader with a block - // of BCF entries loaded in it - bool build(const bcf_reader_type& reader); - bool update(const bcf_entry_type& entry); - - inline const U64& getSamples(void) const{ return(this->n_samples); } - inline const U32& size(void) const{ return(this->position); } - -public: - U64 n_samples; // total number of entries in file - U32 position; // number of entries parsed - U32 p_i[9]; // number of entries in bin i - BYTE* GT_array; // packed genotype array - U32** bins; // bin i - manager_type* manager; // permutation manager -}; - -} /* namespace Algorithm */ -} /* namespace Tomahawk */ - -#endif /* ALGORITHM_COMPRESSION_RADIXSORTGT_H_ */ diff --git a/lib/containers/checksum_container.cpp b/lib/containers/checksum_container.cpp index 21cf4b1..52b963f 100644 --- a/lib/containers/checksum_container.cpp +++ b/lib/containers/checksum_container.cpp @@ -63,25 +63,25 @@ void ChecksumContainer::finalize(void){ bool ChecksumContainer::update(const block_type& block, const header_type& header){ for(U32 i = 0; i < block.footer.n_info_streams; ++i){ //assert(mapTable[block.index_entry.info_offsets[i].key] < this->size()); - if(!(*this)[block.footer.info_offsets[i].data_header.global_key].uncompressed.update(block.info_containers[i].buffer_data_uncompressed, block.info_containers[i].buffer_strides_uncompressed, block.info_containers[i].header.data_header.hasMixedStride())){ + if(!(*this)[block.footer.info_offsets[i].data_header.global_key].uncompressed.update(block.info_containers[i].buffer_data_uncompressed, 
block.info_containers[i].buffer_strides_uncompressed, block.info_containers[i].header.data_header.HasMixedStride())){ std::cerr << utility::timestamp("ERROR","DIGEST") << "Failed to update digest..." << std::endl; return false; } assert(block.footer.info_offsets[i].data_header.global_key < this->size()); - if(!(*this)[block.footer.info_offsets[i].data_header.global_key].compressed.update(block.info_containers[i].buffer_data, block.info_containers[i].buffer_strides, block.info_containers[i].header.data_header.hasMixedStride())){ + if(!(*this)[block.footer.info_offsets[i].data_header.global_key].compressed.update(block.info_containers[i].buffer_data, block.info_containers[i].buffer_strides, block.info_containers[i].header.data_header.HasMixedStride())){ std::cerr << utility::timestamp("ERROR","DIGEST") << "Failed to update digest..." << std::endl; return false; } } for(U32 i = 0; i < block.footer.n_format_streams; ++i){ - if(!(*this)[block.footer.format_offsets[i].data_header.global_key].uncompressed.update(block.format_containers[i].buffer_data_uncompressed, block.format_containers[i].buffer_strides_uncompressed, block.format_containers[i].header.data_header.hasMixedStride())){ + if(!(*this)[block.footer.format_offsets[i].data_header.global_key].uncompressed.update(block.format_containers[i].buffer_data_uncompressed, block.format_containers[i].buffer_strides_uncompressed, block.format_containers[i].header.data_header.HasMixedStride())){ std::cerr << utility::timestamp("ERROR","DIGEST") << "Failed to update digest..." 
<< std::endl; return false; } - if(!(*this)[block.footer.format_offsets[i].data_header.global_key].compressed.update(block.format_containers[i].buffer_data, block.format_containers[i].buffer_strides, block.format_containers[i].header.data_header.hasMixedStride())){ + if(!(*this)[block.footer.format_offsets[i].data_header.global_key].compressed.update(block.format_containers[i].buffer_data, block.format_containers[i].buffer_strides, block.format_containers[i].header.data_header.HasMixedStride())){ std::cerr << utility::timestamp("ERROR","DIGEST") << "Failed to update digest..." << std::endl; return false; } diff --git a/lib/containers/checksum_container.h b/lib/containers/checksum_container.h index 904d64e..2ecf4d6 100644 --- a/lib/containers/checksum_container.h +++ b/lib/containers/checksum_container.h @@ -4,6 +4,7 @@ #include "algorithm/digest/digest_manager.h" #include "containers/variant_block.h" #include "core/header/variant_header.h" +#include "containers/components/generic_iterator.h" namespace tachyon{ namespace containers{ @@ -19,7 +20,10 @@ class ChecksumContainer { typedef const value_type* const_pointer; typedef io::BasicBuffer buffer_type; typedef containers::VariantBlock block_type; - typedef core::VariantHeader header_type; + typedef VariantHeader header_type; + + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; public: ChecksumContainer(void); @@ -28,40 +32,6 @@ class ChecksumContainer { ChecksumContainer(const buffer_type& buffer); ~ChecksumContainer(void); - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; 
} - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - // Element access inline reference at(const size_type& position){ return(this->__entries[position]); } inline const_reference at(const size_type& position) const{ return(this->__entries[position]); } @@ -75,7 +45,7 @@ class ChecksumContainer { inline const_reference back(void) const{ return(this->__entries[this->n_entries - 1]); } // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } + inline bool empty(void) const{ return(this->n_entries == 0); } inline const size_type& size(void) const{ return(this->n_entries); } inline const size_type& capacity(void) const{ return(this->n_capacity); } diff --git a/lib/containers/components/data_block_bitvector.h b/lib/containers/components/data_block_bitvector.h deleted file mode 100644 index f936b6c..0000000 --- a/lib/containers/components/data_block_bitvector.h +++ /dev/null @@ -1,161 +0,0 @@ -#ifndef INDEX_INDEXBLOCKENTRYBITVECTOR_H_ -#define INDEX_INDEXBLOCKENTRYBITVECTOR_H_ - -#include -#include - -#include "io/basic_buffer.h" - -namespace tachyon{ -namespace containers{ - -// Size of entries in these records are -// inferred from the number of INFO/FORMAT/FILTER -// entries in all the records in a block -struct DataBlockBitvector{ - typedef DataBlockBitvector self_type; - typedef std::size_t size_type; - typedef U32 value_type; - typedef value_type& reference; - typedef const value_type& const_reference; - typedef value_type* 
pointer; - typedef const value_type* const_pointer; - -public: - DataBlockBitvector() : - n_keys(0), - local_keys(nullptr), - bit_bytes(nullptr) - { - - } - - ~DataBlockBitvector(){ - delete [] this->local_keys; - delete [] this->bit_bytes; - } - - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - inline void update(const BYTE& value, const U32& pos){ this->bit_bytes[pos] = value; } - - inline void allocate(const U32& n_keys, const U32& n_bytes){ - delete [] this->local_keys; - delete [] this->bit_bytes; - this->local_keys = new U32[n_keys]; - this->bit_bytes = new BYTE[n_bytes]; - memset(this->bit_bytes, 0, n_bytes); - this->n_keys = n_keys; - } - - inline void allocate(const U32& n_bytes){ - delete [] this->bit_bytes; - this->bit_bytes = new BYTE[n_bytes]; - memset(this->bit_bytes, 0, n_bytes); - } - - // Element access - inline reference key_at(const size_type& position){ return(this->local_keys[position]); } - inline const_reference key_at(const size_type& position) const{ 
return(this->local_keys[position]); } - inline pointer key_data(void){ return(this->local_keys); } - inline const_pointer key_data(void) const{ return(this->local_keys); } - inline reference key_front(void){ return(this->local_keys[0]); } - inline const_reference key_front(void) const{ return(this->local_keys[0]); } - inline reference key_back(void){ return(this->local_keys[this->n_keys - 1]); } - inline const_reference key_back(void) const{ return(this->local_keys[this->n_keys - 1]); } - - // Bit access - inline const bool operator[](const U32 position) const{ return((this->bit_bytes[position / 8] & (1 << (position % 8))) >> (position % 8)); } - - // Capacity - inline const bool empty(void) const{ return(this->n_keys == 0); } - inline const value_type& size(void) const{ return(this->n_keys); } - - // Iterator - inline iterator begin(){ return iterator(&this->local_keys[0]); } - inline iterator end() { return iterator(&this->local_keys[this->n_keys - 1]); } - inline const_iterator begin() const{ return const_iterator(&this->local_keys[0]); } - inline const_iterator end() const{ return const_iterator(&this->local_keys[this->n_keys]); } - inline const_iterator cbegin() const{ return const_iterator(&this->local_keys[0]); } - inline const_iterator cend() const{ return const_iterator(&this->local_keys[this->n_keys]); } - - // Utility - inline const U32 getBaseSize(void) const{ return(sizeof(U32) + sizeof(U32)*this->n_keys); } - -private: - friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const self_type& entry){ - buffer += entry.n_keys; - for(U32 i = 0; i < entry.n_keys; ++i) buffer += entry.local_keys[i]; - - return(buffer); - } - - friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, self_type& entry){ - delete [] entry.local_keys; - buffer >> entry.n_keys; - entry.local_keys = new U32[entry.n_keys]; - for(U32 i = 0; i < entry.n_keys; ++i) buffer >> entry.local_keys[i]; - - return(buffer); - } - - friend std::ostream& operator<<(std::ostream& 
stream, const self_type& entry){ - stream.write(reinterpret_cast(&entry.n_keys), sizeof(U32)); - for(U32 i = 0; i < entry.n_keys; ++i) - stream.write(reinterpret_cast(&entry.local_keys[i]), sizeof(U32)); - - return(stream); - } - - friend std::ifstream& operator>>(std::ifstream& stream, self_type& entry){ - delete [] entry.local_keys; - stream.read(reinterpret_cast(&entry.n_keys), sizeof(U32)); - entry.local_keys = new U32[entry.n_keys]; - for(U32 i = 0; i < entry.n_keys; ++i) - stream.read(reinterpret_cast(&entry.local_keys[i]), sizeof(U32)); - - return(stream); - } - -public: - value_type n_keys; - pointer local_keys; - BYTE* bit_bytes; -}; - -} -} - -#endif /* INDEX_INDEXBLOCKENTRYBITVECTOR_H_ */ diff --git a/lib/containers/components/data_container_header.h b/lib/containers/components/data_container_header.h index 68e65e0..7232299 100644 --- a/lib/containers/components/data_container_header.h +++ b/lib/containers/components/data_container_header.h @@ -44,7 +44,7 @@ struct DataContainerHeader{ } // Comparators - const bool operator==(const self_type& other) const{ + bool operator==(const self_type& other) const{ if(this->identifier != other.identifier) return false; if(this->n_entries != other.n_entries) return false; if(this->n_additions != other.n_additions) return false; @@ -53,7 +53,7 @@ struct DataContainerHeader{ if(this->stride_header != other.stride_header) return false; return true; } - inline const bool operator!=(const self_type& other) const{ return(!(*this == other)); } + inline bool operator!=(const self_type& other) const{ return(!(*this == other)); } self_type& operator+=(const self_type& other){ this->n_entries += other.n_entries; @@ -65,7 +65,7 @@ struct DataContainerHeader{ // Accessors inline S32& getGlobalKey(void){ return(this->data_header.global_key); } inline const S32& getGlobalKey(void) const{ return(this->data_header.global_key); } - inline const bool hasMixedStride(void) const{ return(this->data_header.hasMixedStride()); } + inline 
bool hasMixedStride(void) const{ return(this->data_header.HasMixedStride()); } private: friend buffer_type& operator<<(buffer_type& buffer, const self_type& entry){ @@ -75,7 +75,7 @@ struct DataContainerHeader{ buffer += entry.n_strides; buffer << entry.data_header; - if(entry.data_header.hasMixedStride()) buffer << entry.stride_header; + if(entry.data_header.HasMixedStride()) buffer << entry.stride_header; return(buffer); } @@ -87,7 +87,7 @@ struct DataContainerHeader{ buffer >> entry.n_strides; buffer >> entry.data_header; - if(entry.data_header.hasMixedStride()) buffer >> entry.stride_header; + if(entry.data_header.HasMixedStride()) buffer >> entry.stride_header; return(buffer); } @@ -98,7 +98,7 @@ struct DataContainerHeader{ stream.write(reinterpret_cast(&entry.n_additions), sizeof(U32)); stream.write(reinterpret_cast(&entry.n_strides), sizeof(U32)); stream << entry.data_header; - if(entry.data_header.hasMixedStride()) + if(entry.data_header.HasMixedStride()) stream << entry.stride_header; return(stream); @@ -110,7 +110,7 @@ struct DataContainerHeader{ stream.read(reinterpret_cast(&entry.n_additions), sizeof(U32)); stream.read(reinterpret_cast(&entry.n_strides), sizeof(U32)); stream >> entry.data_header; - if(entry.data_header.hasMixedStride()) + if(entry.data_header.HasMixedStride()) stream >> entry.stride_header; return(stream); diff --git a/lib/containers/components/data_container_header_controller.h b/lib/containers/components/data_container_header_controller.h index d183430..18bd757 100644 --- a/lib/containers/components/data_container_header_controller.h +++ b/lib/containers/components/data_container_header_controller.h @@ -15,6 +15,7 @@ namespace containers{ // Controller type for stream container struct DataContainerHeaderController{ typedef DataContainerHeaderController self_type; + typedef io::BasicBuffer buffer_type; public: DataContainerHeaderController() : @@ -37,9 +38,9 @@ struct DataContainerHeaderController{ this->encryption = 0; } - inline 
const bool isEncrypted(void) const{ return(this->encryption > 0); } - inline const bool compareType(const BYTE& type) const{ return(this->type == type); } - inline const bool compareTypeSign(const BYTE& type, const bool& sign) const{ return(this->type == type && this->signedness == sign); } + inline bool isEncrypted(void) const{ return(this->encryption > 0); } + inline bool compareType(const BYTE& type) const{ return(this->type == type); } + inline bool compareTypeSign(const BYTE& type, const bool& sign) const{ return(this->type == type && this->signedness == sign); } self_type& operator=(const self_type& other){ this->signedness = other.signedness; @@ -51,7 +52,7 @@ struct DataContainerHeaderController{ return(*this); } - const bool operator==(const self_type& other) const{ + bool operator==(const self_type& other) const{ if(this->signedness != other.signedness) return false; if(this->mixedStride != other.mixedStride) return false; if(this->type != other.type) return false; @@ -60,12 +61,19 @@ struct DataContainerHeaderController{ if(this->encryption != other.encryption) return false; return true; } - inline const bool operator!=(const self_type& other) const{ return(!(*this == other)); } + inline bool operator!=(const self_type& other) const{ return(!(*this == other)); } private: - friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer,const self_type& controller){ - const U16* c = reinterpret_cast(&controller); - buffer += *c; + friend buffer_type& operator<<(buffer_type& buffer,const self_type& controller){ + const U16 c = controller.signedness << 0 | + controller.mixedStride << 1 | + controller.type << 2 | + controller.encoder << 8 | + controller.uniform << 13 | + controller.encryption << 14; + + //const U16* c = reinterpret_cast(&controller); + buffer += c; return(buffer); } @@ -77,7 +85,7 @@ struct DataContainerHeaderController{ controller.uniform << 13 | controller.encryption << 14; - assert(*reinterpret_cast(&controller) == c); + 
//assert(*reinterpret_cast(&controller) == c); stream.write(reinterpret_cast(&c), sizeof(U16)); return(stream); @@ -88,7 +96,7 @@ struct DataContainerHeaderController{ return(stream); } - friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, self_type& controller){ + friend buffer_type& operator>>(buffer_type& buffer, self_type& controller){ U16* c = reinterpret_cast(&controller); buffer >> *c; return(buffer); diff --git a/lib/containers/components/data_container_header_object.cpp b/lib/containers/components/data_container_header_object.cpp index d8803a5..10d8c7f 100644 --- a/lib/containers/components/data_container_header_object.cpp +++ b/lib/containers/components/data_container_header_object.cpp @@ -9,9 +9,10 @@ DataContainerHeaderObject::DataContainerHeaderObject() : cLength(0), uLength(0), eLength(0), - crc(0), global_key(-1) -{} +{ + memset(&this->crc[0], 0, MD5_DIGEST_LENGTH); +} DataContainerHeaderObject::DataContainerHeaderObject(const DataContainerHeaderObject& other) : controller(other.controller), @@ -20,9 +21,9 @@ DataContainerHeaderObject::DataContainerHeaderObject(const DataContainerHeaderOb cLength(other.cLength), uLength(other.uLength), eLength(other.eLength), - crc(other.crc), global_key(other.global_key) { + memcpy(&this->crc[0], &other.crc[0], MD5_DIGEST_LENGTH); } DataContainerHeaderObject::DataContainerHeaderObject(DataContainerHeaderObject&& other) noexcept : @@ -32,13 +33,11 @@ DataContainerHeaderObject::DataContainerHeaderObject(DataContainerHeaderObject&& cLength(other.cLength), uLength(other.uLength), eLength(other.eLength), - crc(other.crc), global_key(other.global_key) { - + memcpy(&this->crc[0], &other.crc[0], MD5_DIGEST_LENGTH); } - // copy assignment DataContainerHeaderObject& DataContainerHeaderObject::operator=(const DataContainerHeaderObject& other){ this->controller = other.controller; this->stride = other.stride; @@ -46,13 +45,11 @@ DataContainerHeaderObject& DataContainerHeaderObject::operator=(const DataContai 
this->cLength = other.cLength; this->uLength = other.uLength; this->eLength = other.eLength; - this->crc = other.crc; + memcpy(&this->crc[0], &other.crc[0], MD5_DIGEST_LENGTH); this->global_key = other.global_key; return *this; } - -/** Move assignment operator */ DataContainerHeaderObject& DataContainerHeaderObject::operator=(DataContainerHeaderObject&& other) noexcept{ this->controller = other.controller; this->stride = other.stride; @@ -60,7 +57,7 @@ DataContainerHeaderObject& DataContainerHeaderObject::operator=(DataContainerHea this->cLength = other.cLength; this->uLength = other.uLength; this->eLength = other.eLength; - this->crc = other.crc; + memcpy(&this->crc[0], &other.crc[0], MD5_DIGEST_LENGTH); this->global_key = other.global_key; return *this; } @@ -73,23 +70,25 @@ void DataContainerHeaderObject::reset(void){ this->offset = 0; this->cLength = 0; this->uLength = 0; - this->crc = 0; + memset(&this->crc[0], 0, MD5_DIGEST_LENGTH); this->global_key = -1; } -const bool DataContainerHeaderObject::operator==(const self_type& other) const{ +bool DataContainerHeaderObject::operator==(const self_type& other) const{ if(this->stride != other.stride) return false; if(this->offset != other.offset) return false; if(this->cLength != other.cLength) return false; if(this->uLength != other.uLength) return false; if(this->eLength != other.eLength) return false; - if(this->crc != other.crc) return false; if(this->global_key != other.global_key) return false; if(this->controller != other.controller) return false; + for(U32 i = 0; i < MD5_DIGEST_LENGTH; ++i) + if(this->crc[i] != other.crc[i]) return false; + return true; } -const SBYTE DataContainerHeaderObject::getPrimitiveWidth(void) const{ +SBYTE DataContainerHeaderObject::GetPrimitiveWidth(void) const{ // We do not care about signedness here switch(this->controller.type){ case(YON_TYPE_UNKNOWN): diff --git a/lib/containers/components/data_container_header_object.h b/lib/containers/components/data_container_header_object.h 
index ea8f70c..7717bd5 100644 --- a/lib/containers/components/data_container_header_object.h +++ b/lib/containers/components/data_container_header_object.h @@ -5,6 +5,8 @@ #include "support/enums.h" #include "data_container_header_controller.h" +#include "openssl/md5.h" + namespace tachyon{ namespace containers{ @@ -20,32 +22,38 @@ struct DataContainerHeaderObject{ ~DataContainerHeaderObject(); void reset(void); - const bool operator==(const self_type& other) const; - inline const bool operator!=(const self_type& other) const{ return(!(*this == other)); } + bool operator==(const self_type& other) const; + inline bool operator!=(const self_type& other) const{ return(!(*this == other)); } - const SBYTE getPrimitiveWidth(void) const; + SBYTE GetPrimitiveWidth(void) const; // - inline S32& getStride(void){ return(this->stride); } - inline const S32& getStride(void) const{ return(this->stride); } + inline S32& GetStride(void){ return(this->stride); } + inline const S32& GetStride(void) const{ return(this->stride); } - inline const bool isUniform(void) const{ return(this->controller.uniform); } - inline const bool isSigned(void) const{ return(this->controller.signedness); } - inline const bool hasMixedStride(void) const{ return(this->controller.mixedStride); } - inline void setUniform(const bool yes){ this->controller.uniform = yes; } - inline void setSignedness(const bool yes){ this->controller.signedness = yes; } - inline void setMixedStride(const bool yes){ this->controller.mixedStride = yes; } + inline bool IsUniform(void) const{ return(this->controller.uniform); } + inline bool IsSigned(void) const{ return(this->controller.signedness); } + inline bool HasMixedStride(void) const{ return(this->controller.mixedStride); } + inline void SetUniform(const bool yes){ this->controller.uniform = yes; } + inline void SetSignedness(const bool yes){ this->controller.signedness = yes; } + inline void SetMixedStride(const bool yes){ this->controller.mixedStride = yes; } - inline 
const TACHYON_CORE_TYPE getPrimitiveType(void) const{ return(TACHYON_CORE_TYPE(this->controller.type)); } - inline const TACHYON_CORE_COMPRESSION getEncoder(void) const{ return(TACHYON_CORE_COMPRESSION(this->controller.encoder)); } + inline TACHYON_CORE_TYPE GetPrimitiveType(void) const{ return(TACHYON_CORE_TYPE(this->controller.type)); } + inline TACHYON_CORE_COMPRESSION GetEncoder(void) const{ return(TACHYON_CORE_COMPRESSION(this->controller.encoder)); } // Set types - inline void setType(const TACHYON_CORE_TYPE& type){ this->controller.type = type; } + inline void SetType(const TACHYON_CORE_TYPE& type){ this->controller.type = type; } // Checksum - inline U32& getChecksum(void){ return(this->crc); } - inline const U32& getChecksum(void) const{ return(this->crc); } - inline const bool checkChecksum(const U32 checksum) const{ return(this->crc == checksum); } + inline uint8_t* GetChecksum(void){ return(&this->crc[0]); } + inline const uint8_t* GetChecksum(void) const{ return(&this->crc[0]); } + bool CheckChecksum(const uint8_t* compare) const{ + for(U32 i = 0; i < MD5_DIGEST_LENGTH; ++i){ + if(compare[i] != this->crc[i]) + return false; + } + return true; + } private: friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const self_type& entry){ @@ -55,7 +63,7 @@ struct DataContainerHeaderObject{ buffer += entry.cLength; buffer += entry.uLength; buffer += entry.eLength; - buffer += entry.crc; + for(U32 i = 0; i < MD5_DIGEST_LENGTH; ++i) buffer += entry.crc[i]; buffer += entry.global_key; return(buffer); } @@ -67,7 +75,7 @@ struct DataContainerHeaderObject{ stream.write(reinterpret_cast(&entry.cLength), sizeof(U32)); stream.write(reinterpret_cast(&entry.uLength), sizeof(U32)); stream.write(reinterpret_cast(&entry.eLength), sizeof(U32)); - stream.write(reinterpret_cast(&entry.crc), sizeof(U32)); + stream.write(reinterpret_cast(&entry.crc[0]), sizeof(uint8_t)*MD5_DIGEST_LENGTH); stream.write(reinterpret_cast(&entry.global_key),sizeof(S32)); return(stream); } @@ 
-79,7 +87,7 @@ struct DataContainerHeaderObject{ buffer >> entry.cLength; buffer >> entry.uLength; buffer >> entry.eLength; - buffer >> entry.crc; + for(U32 i = 0; i < MD5_DIGEST_LENGTH; ++i) buffer >> entry.crc[i]; buffer >> entry.global_key; return(buffer); } @@ -91,7 +99,7 @@ struct DataContainerHeaderObject{ stream.read(reinterpret_cast(&entry.cLength), sizeof(U32)); stream.read(reinterpret_cast(&entry.uLength), sizeof(U32)); stream.read(reinterpret_cast(&entry.eLength), sizeof(U32)); - stream.read(reinterpret_cast(&entry.crc), sizeof(U32)); + stream.read(reinterpret_cast(&entry.crc[0]), sizeof(uint8_t)*MD5_DIGEST_LENGTH); stream.read(reinterpret_cast(&entry.global_key), sizeof(S32)); return(stream); @@ -104,7 +112,7 @@ struct DataContainerHeaderObject{ U32 cLength; // compressed length U32 uLength; // uncompressed length U32 eLength; // encrypted length - U32 crc; // crc32 checksum + uint8_t crc[MD5_DIGEST_LENGTH]; // MD5 checksum S32 global_key; // global key }; diff --git a/lib/containers/components/generic_iterator.h b/lib/containers/components/generic_iterator.h new file mode 100644 index 0000000..be362c2 --- /dev/null +++ b/lib/containers/components/generic_iterator.h @@ -0,0 +1,70 @@ +#ifndef CONTAINERS_COMPONENTS_GENERIC_ITERATOR_H_ +#define CONTAINERS_COMPONENTS_GENERIC_ITERATOR_H_ + +#include +#include + +namespace tachyon{ + +//------------------------------------------------------------------- +// Raw iterator with random access +//------------------------------------------------------------------- +template +class yonRawIterator : public std::iterator +{ +public: + typedef yonRawIterator self_type; + typedef DataType value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + +public: + yonRawIterator(DataType* ptr = nullptr){m_ptr = ptr;} + yonRawIterator(const 
self_type& rawIterator) = default; + ~yonRawIterator(){} + + self_type& operator=(const self_type& rawIterator) = default; + self_type& operator=(pointer ptr){m_ptr = ptr;return (*this);} + + operator bool() const { + if(m_ptr) return true; + else return false; + } + + bool operator==(const self_type& rawIterator)const{return (m_ptr == rawIterator.getConstPtr());} + bool operator!=(const self_type& rawIterator)const{return (m_ptr != rawIterator.getConstPtr());} + + self_type& operator+=(const ptrdiff_t& movement){m_ptr += movement;return (*this);} + self_type& operator-=(const ptrdiff_t& movement){m_ptr -= movement;return (*this);} + self_type& operator++(){++m_ptr;return (*this);} + self_type& operator--(){--m_ptr;return (*this);} + self_type operator++(int){auto temp(*this);++m_ptr;return temp;} + self_type operator--(int){auto temp(*this);--m_ptr;return temp;} + self_type operator+(const ptrdiff_t& movement){auto oldPtr = m_ptr;m_ptr+=movement;auto temp(*this);m_ptr = oldPtr;return temp;} + self_type operator-(const ptrdiff_t& movement){auto oldPtr = m_ptr;m_ptr-=movement;auto temp(*this);m_ptr = oldPtr;return temp;} + + ptrdiff_t operator-(const self_type& rawIterator){return std::distance(rawIterator.getPtr(),this->getPtr());} + + reference operator*(){return *m_ptr;} + const_reference operator*()const{return *m_ptr;} + pointer operator->(){return m_ptr;} + pointer getPtr()const{return m_ptr;} + const_pointer getConstPtr()const{return m_ptr;} + +protected: + pointer m_ptr; +}; + +} + + + +#endif /* CONTAINERS_COMPONENTS_GENERIC_ITERATOR_H_ */ diff --git a/lib/containers/components/variant_block_footer.cpp b/lib/containers/components/variant_block_footer.cpp index 6bb7012..3928d56 100644 --- a/lib/containers/components/variant_block_footer.cpp +++ b/lib/containers/components/variant_block_footer.cpp @@ -16,60 +16,51 @@ VariantBlockFooter::VariantBlockFooter(): n_info_patterns(0), n_format_patterns(0), n_filter_patterns(0), + offsets(new 
header_type[YON_BLK_N_STATIC]), info_offsets(nullptr), format_offsets(nullptr), filter_offsets(nullptr), - info_bit_vectors(nullptr), - format_bit_vectors(nullptr), - filter_bit_vectors(nullptr) + n_info_patterns_allocated(0), + n_format_patterns_allocated(0), + n_filter_patterns_allocated(0), + info_patterns(nullptr), + format_patterns(nullptr), + filter_patterns(nullptr), + info_map(nullptr), + format_map(nullptr), + filter_map(nullptr), + info_pattern_map(nullptr), + format_pattern_map(nullptr), + filter_pattern_map(nullptr) {} VariantBlockFooter::~VariantBlockFooter(){ + delete [] this->offsets; delete [] this->info_offsets; delete [] this->format_offsets; delete [] this->filter_offsets; - delete [] this->info_bit_vectors; - delete [] this->format_bit_vectors; - delete [] this->filter_bit_vectors; + delete [] this->info_patterns; + delete [] this->format_patterns; + delete [] this->filter_patterns; + delete this->info_map; + delete this->format_map; + delete this->filter_map; + delete this->info_pattern_map; + delete this->format_pattern_map; + delete this->filter_pattern_map; } void VariantBlockFooter::reset(void){ // Headers of the various containers - this->offset_ppa.reset(); - this->offset_meta_contig.reset(); - this->offset_meta_position.reset(); - this->offset_meta_refalt.reset(); - this->offset_meta_controllers.reset(); - this->offset_meta_quality.reset(); - this->offset_meta_names.reset(); - this->offset_meta_alleles.reset(); - this->offset_meta_info_id.reset(); - this->offset_meta_format_id.reset(); - this->offset_meta_filter_id.reset(); - this->offset_gt_8b.reset(); - this->offset_gt_16b.reset(); - this->offset_gt_32b.reset(); - this->offset_gt_64b.reset(); - this->offset_gt_simple8.reset(); - this->offset_gt_simple16.reset(); - this->offset_gt_simple32.reset(); - this->offset_gt_simple64.reset(); - this->offset_gt_helper.reset(); + for(U32 i = 0; i < YON_BLK_N_STATIC; ++i) this->offsets[i].reset(); - delete [] this->info_offsets; - delete [] 
this->format_offsets; - delete [] this->filter_offsets; - this->info_offsets = nullptr; - this->format_offsets = nullptr; - this->filter_offsets = nullptr; + for(U32 i = 0; i < this->n_info_streams; ++i) this->info_offsets[i].reset(); + for(U32 i = 0; i < this->n_format_streams; ++i) this->format_offsets[i].reset(); + for(U32 i = 0; i < this->n_filter_streams; ++i) this->filter_offsets[i].reset(); - // Bit vectors - delete [] this->info_bit_vectors; - delete [] this->format_bit_vectors; - delete [] this->filter_bit_vectors; - this->info_bit_vectors = nullptr; - this->format_bit_vectors = nullptr; - this->filter_bit_vectors = nullptr; + for(U32 i = 0; i < this->n_info_patterns; ++i) this->info_patterns[i].clear(); + for(U32 i = 0; i < this->n_format_patterns; ++i) this->format_patterns[i].clear(); + for(U32 i = 0; i < this->n_filter_patterns; ++i) this->filter_patterns[i].clear(); this->n_info_streams = 0; this->n_format_streams = 0; @@ -77,207 +68,17 @@ void VariantBlockFooter::reset(void){ this->n_info_patterns = 0; this->n_format_patterns = 0; this->n_filter_patterns = 0; -} -bool VariantBlockFooter::constructBitVector(const INDEX_BLOCK_TARGET& target, hash_container_type& values, hash_vector_container_type& patterns){ - if(values.size() == 0) - return false; - - // Determine target - switch(target){ - case(INDEX_BLOCK_TARGET::INDEX_INFO) : - this->n_info_patterns = patterns.size(); - return(this->__constructBitVector(this->info_bit_vectors, this->info_offsets, values, patterns)); - break; - case(INDEX_BLOCK_TARGET::INDEX_FORMAT) : - this->n_format_patterns = patterns.size(); - return(this->__constructBitVector(this->format_bit_vectors, this->format_offsets, values, patterns)); - break; - case(INDEX_BLOCK_TARGET::INDEX_FILTER) : - this->n_filter_patterns = patterns.size(); - return(this->__constructBitVector(this->filter_bit_vectors, this->filter_offsets, values, patterns)); - break; - default: std::cerr << "unknown target type" << std::endl; exit(1); - } - - 
return false; + this->resetTables(); } -bool VariantBlockFooter::__constructBitVector(bit_vector*& target, - header_type* offset, - hash_container_type& values, - hash_vector_container_type& patterns) -{ - if(values.size() == 0) return false; - - // Determine the required width in bytes of the bit-vector - BYTE bitvector_width = ceil((float)values.size()/8); - - // Allocate new bit-vectors - delete [] target; - target = new bit_vector[patterns.size()]; - - // Allocate memory for these bit-vectors - for(U32 i = 0; i < patterns.size(); ++i) - target[i].allocate(patterns[i].size(), bitvector_width); - - // Cycle over pattern size - for(U32 i = 0; i < patterns.size(); ++i){ - for(U32 j = 0; j < patterns[i].size(); ++j){ - // Set arbitrary local key: this value is update by reference in `getRaw()` - U32 local_key = 0; - - // Map from absolute key to local key - if(!values.getRaw(patterns[i][j], local_key)){ - std::cerr << "impossible to get " << patterns[i][j] << std::endl; - exit(1); - } - - // Set bit at local key position - target[i].bit_bytes[local_key/8] |= 1 << (local_key % 8); - - // Store local key in key-chain - target[i].local_keys[j] = local_key; - - // Store absolute key - offset[local_key].data_header.global_key = patterns[i][j]; - } - } - return true; -} - -std::ostream& operator<<(std::ostream& stream, const VariantBlockFooter& entry){ - stream.write(reinterpret_cast(&entry.n_info_streams), sizeof(U16)); - stream.write(reinterpret_cast(&entry.n_format_streams), sizeof(U16)); - stream.write(reinterpret_cast(&entry.n_filter_streams), sizeof(U16)); - stream.write(reinterpret_cast(&entry.n_info_patterns), sizeof(U16)); - stream.write(reinterpret_cast(&entry.n_format_patterns), sizeof(U16)); - stream.write(reinterpret_cast(&entry.n_filter_patterns), sizeof(U16)); - - stream << entry.offset_ppa; - stream << entry.offset_meta_contig; - stream << entry.offset_meta_position; - stream << entry.offset_meta_refalt; - stream << entry.offset_meta_controllers; - stream 
<< entry.offset_meta_quality; - stream << entry.offset_meta_names; - stream << entry.offset_meta_alleles; - stream << entry.offset_meta_info_id; - stream << entry.offset_meta_format_id; - stream << entry.offset_meta_filter_id; - stream << entry.offset_gt_8b; - stream << entry.offset_gt_16b; - stream << entry.offset_gt_32b; - stream << entry.offset_gt_64b; - stream << entry.offset_gt_simple8; - stream << entry.offset_gt_simple16; - stream << entry.offset_gt_simple32; - stream << entry.offset_gt_simple64; - stream << entry.offset_gt_helper; - - for(U32 i = 0; i < entry.n_info_streams; ++i) stream << entry.info_offsets[i]; - for(U32 i = 0; i < entry.n_format_streams; ++i) stream << entry.format_offsets[i]; - for(U32 i = 0; i < entry.n_filter_streams; ++i) stream << entry.filter_offsets[i]; - - // write - if(entry.n_info_patterns){ - const BYTE info_bitvector_width = ceil((float)entry.n_info_streams/8); - for(U32 i = 0; i < entry.n_info_patterns; ++i){ - stream << entry.info_bit_vectors[i]; - stream.write((const char*)entry.info_bit_vectors[i].bit_bytes, info_bitvector_width); - } - } - - if(entry.n_format_patterns){ - const BYTE format_bitvector_width = ceil((float)entry.n_format_streams/8); - for(U32 i = 0; i < entry.n_format_patterns; ++i){ - stream << entry.format_bit_vectors[i]; - stream.write((const char*)entry.format_bit_vectors[i].bit_bytes, format_bitvector_width); - } - } - - if(entry.n_filter_patterns){ - const BYTE filter_bitvector_width = ceil((float)entry.n_filter_streams/8); - for(U32 i = 0; i < entry.n_filter_patterns; ++i){ - stream << entry.filter_bit_vectors[i]; - stream.write((const char*)entry.filter_bit_vectors[i].bit_bytes, filter_bitvector_width); - } - } - - return(stream); -} - -std::ifstream& operator>>(std::ifstream& stream, VariantBlockFooter& entry){ - stream.read(reinterpret_cast(&entry.n_info_streams), sizeof(U16)); - stream.read(reinterpret_cast(&entry.n_format_streams), sizeof(U16)); - 
stream.read(reinterpret_cast(&entry.n_filter_streams), sizeof(U16)); - stream.read(reinterpret_cast(&entry.n_info_patterns), sizeof(U16)); - stream.read(reinterpret_cast(&entry.n_format_patterns), sizeof(U16)); - stream.read(reinterpret_cast(&entry.n_filter_patterns), sizeof(U16)); - - entry.l_info_bitvector = ceil((float)entry.n_info_streams/8); - entry.l_format_bitvector = ceil((float)entry.n_format_streams/8); - entry.l_filter_bitvector = ceil((float)entry.n_filter_streams/8); - - stream >> entry.offset_ppa; - stream >> entry.offset_meta_contig; - stream >> entry.offset_meta_position; - stream >> entry.offset_meta_refalt; - stream >> entry.offset_meta_controllers; - stream >> entry.offset_meta_quality; - stream >> entry.offset_meta_names; - stream >> entry.offset_meta_alleles; - stream >> entry.offset_meta_info_id; - stream >> entry.offset_meta_format_id; - stream >> entry.offset_meta_filter_id; - stream >> entry.offset_gt_8b; - stream >> entry.offset_gt_16b; - stream >> entry.offset_gt_32b; - stream >> entry.offset_gt_64b; - stream >> entry.offset_gt_simple8; - stream >> entry.offset_gt_simple16; - stream >> entry.offset_gt_simple32; - stream >> entry.offset_gt_simple64; - stream >> entry.offset_gt_helper; - - entry.info_offsets = new DataContainerHeader[entry.n_info_streams]; - entry.format_offsets = new DataContainerHeader[entry.n_format_streams]; - entry.filter_offsets = new DataContainerHeader[entry.n_filter_streams]; - for(U32 i = 0; i < entry.n_info_streams; ++i) stream >> entry.info_offsets[i]; - for(U32 i = 0; i < entry.n_format_streams; ++i) stream >> entry.format_offsets[i]; - for(U32 i = 0; i < entry.n_filter_streams; ++i) stream >> entry.filter_offsets[i]; - - if(entry.n_info_patterns){ - BYTE info_bitvector_width = ceil((float)entry.n_info_streams/8); - entry.info_bit_vectors = new DataBlockBitvector[entry.n_info_patterns]; - for(U32 i = 0; i < entry.n_info_patterns; ++i){ - stream >> entry.info_bit_vectors[i]; - 
entry.info_bit_vectors[i].allocate(info_bitvector_width); - stream.read((char*)entry.info_bit_vectors[i].bit_bytes, info_bitvector_width); - } - } - - if(entry.n_format_patterns){ - BYTE format_bitvector_width = ceil((float)entry.n_format_streams/8); - entry.format_bit_vectors = new DataBlockBitvector[entry.n_format_patterns]; - for(U32 i = 0; i < entry.n_format_patterns; ++i){ - stream >> entry.format_bit_vectors[i]; - entry.format_bit_vectors[i].allocate(format_bitvector_width); - stream.read((char*)entry.format_bit_vectors[i].bit_bytes, format_bitvector_width); - } - } - - if(entry.n_filter_patterns){ - BYTE filter_bitvector_width = ceil((float)entry.n_filter_streams/8); - entry.filter_bit_vectors = new DataBlockBitvector[entry.n_filter_patterns]; - for(U32 i = 0; i < entry.n_filter_patterns; ++i){ - stream >> entry.filter_bit_vectors[i]; - entry.filter_bit_vectors[i].allocate(filter_bitvector_width); - stream.read((char*)entry.filter_bit_vectors[i].bit_bytes, filter_bitvector_width); - } - } - - return(stream); +void VariantBlockFooter::resetTables(){ + if(this->info_map != nullptr) this->info_map->clear(); + if(this->format_map != nullptr) this->format_map->clear(); + if(this->filter_map != nullptr) this->filter_map->clear(); + if(this->info_pattern_map != nullptr) this->info_pattern_map->clear(); + if(this->format_pattern_map != nullptr) this->format_pattern_map->clear(); + if(this->filter_pattern_map != nullptr) this->filter_pattern_map->clear(); } io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const VariantBlockFooter& entry){ @@ -288,60 +89,22 @@ io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const VariantBlockFooter& e buffer += (U16)entry.n_format_patterns; buffer += (U16)entry.n_filter_patterns; - buffer << entry.offset_ppa; - buffer << entry.offset_meta_contig; - buffer << entry.offset_meta_position; - buffer << entry.offset_meta_refalt; - buffer << entry.offset_meta_controllers; - buffer << entry.offset_meta_quality; - buffer << 
entry.offset_meta_names; - buffer << entry.offset_meta_alleles; - buffer << entry.offset_meta_info_id; - buffer << entry.offset_meta_format_id; - buffer << entry.offset_meta_filter_id; - buffer << entry.offset_gt_8b; - buffer << entry.offset_gt_16b; - buffer << entry.offset_gt_32b; - buffer << entry.offset_gt_64b; - buffer << entry.offset_gt_simple8; - buffer << entry.offset_gt_simple16; - buffer << entry.offset_gt_simple32; - buffer << entry.offset_gt_simple64; - buffer << entry.offset_gt_helper; - + for(U32 i = 0; i < YON_BLK_N_STATIC; ++i) buffer << entry.offsets[i]; for(U32 i = 0; i < entry.n_info_streams; ++i) buffer << entry.info_offsets[i]; for(U32 i = 0; i < entry.n_format_streams; ++i) buffer << entry.format_offsets[i]; for(U32 i = 0; i < entry.n_filter_streams; ++i) buffer << entry.filter_offsets[i]; - if(entry.n_info_patterns > 0){ - const BYTE info_bitvector_width = ceil((float)entry.n_info_streams/8); - for(U32 i = 0; i < entry.n_info_patterns; ++i){ - buffer << entry.info_bit_vectors[i]; - buffer.Add((const char* const)&entry.info_bit_vectors[i].bit_bytes[0], info_bitvector_width); - } - } - - if(entry.n_format_patterns > 0){ - const BYTE format_bitvector_width = ceil((float)entry.n_format_streams/8); - for(U32 i = 0; i < entry.n_format_patterns; ++i){ - buffer << entry.format_bit_vectors[i]; - buffer.Add((const char* const)&entry.format_bit_vectors[i].bit_bytes[0], format_bitvector_width); - } - } - - if(entry.n_filter_patterns > 0){ - const BYTE filter_bitvector_width = ceil((float)entry.n_filter_streams/8); - for(U32 i = 0; i < entry.n_filter_patterns; ++i){ - buffer << entry.filter_bit_vectors[i]; - buffer.Add((const char* const)&entry.filter_bit_vectors[i].bit_bytes[0], filter_bitvector_width); - } - } + for(U32 i = 0; i < entry.n_info_patterns; ++i) buffer << entry.info_patterns[i]; + for(U32 i = 0; i < entry.n_format_patterns; ++i) buffer << entry.format_patterns[i]; + for(U32 i = 0; i < entry.n_filter_patterns; ++i) buffer << 
entry.filter_patterns[i]; return(buffer); } io::BasicBuffer& operator>>(io::BasicBuffer& buffer, VariantBlockFooter& entry){ + entry.reset(); + buffer >> entry.n_info_streams; buffer >> entry.n_format_streams; buffer >> entry.n_filter_streams; @@ -349,66 +112,57 @@ io::BasicBuffer& operator>>(io::BasicBuffer& buffer, VariantBlockFooter& entry){ buffer >> entry.n_format_patterns; buffer >> entry.n_filter_patterns; - entry.l_info_bitvector = ceil((float)entry.n_info_streams/8); - entry.l_format_bitvector = ceil((float)entry.n_format_streams/8); - entry.l_filter_bitvector = ceil((float)entry.n_filter_streams/8); - - buffer >> entry.offset_ppa; - buffer >> entry.offset_meta_contig; - buffer >> entry.offset_meta_position; - buffer >> entry.offset_meta_refalt; - buffer >> entry.offset_meta_controllers; - buffer >> entry.offset_meta_quality; - buffer >> entry.offset_meta_names; - buffer >> entry.offset_meta_alleles; - buffer >> entry.offset_meta_info_id; - buffer >> entry.offset_meta_format_id; - buffer >> entry.offset_meta_filter_id; - buffer >> entry.offset_gt_8b; - buffer >> entry.offset_gt_16b; - buffer >> entry.offset_gt_32b; - buffer >> entry.offset_gt_64b; - buffer >> entry.offset_gt_simple8; - buffer >> entry.offset_gt_simple16; - buffer >> entry.offset_gt_simple32; - buffer >> entry.offset_gt_simple64; - buffer >> entry.offset_gt_helper; + entry.l_info_bitvector = ceil((float)entry.n_info_streams / 8); + entry.l_format_bitvector = ceil((float)entry.n_format_streams / 8); + entry.l_filter_bitvector = ceil((float)entry.n_filter_streams / 8); + entry.BuildMaps(); // Construct new maps. + entry.BuildPatternMaps(); // Construct new pattern maps. 
+ entry.offsets = new DataContainerHeader[YON_BLK_N_STATIC]; entry.info_offsets = new DataContainerHeader[entry.n_info_streams]; entry.format_offsets = new DataContainerHeader[entry.n_format_streams]; entry.filter_offsets = new DataContainerHeader[entry.n_filter_streams]; - for(U32 i = 0; i < entry.n_info_streams; ++i) buffer >> entry.info_offsets[i]; - for(U32 i = 0; i < entry.n_format_streams; ++i) buffer >> entry.format_offsets[i]; - for(U32 i = 0; i < entry.n_filter_streams; ++i) buffer >> entry.filter_offsets[i]; + entry.n_info_patterns_allocated = entry.n_info_streams; + entry.n_format_patterns_allocated = entry.n_format_streams; + entry.n_filter_patterns_allocated = entry.n_filter_streams; + + for(U32 i = 0; i < YON_BLK_N_STATIC; ++i) + buffer >> entry.offsets[i]; + + for(U32 i = 0; i < entry.n_info_streams; ++i){ + buffer >> entry.info_offsets[i]; + entry.UpdateInfo(entry.info_offsets[i], i); + } + + for(U32 i = 0; i < entry.n_format_streams; ++i){ + buffer >> entry.format_offsets[i]; + entry.UpdateFormat(entry.format_offsets[i], i); + } + + for(U32 i = 0; i < entry.n_filter_streams; ++i){ + buffer >> entry.filter_offsets[i]; + entry.UpdateFilter(entry.filter_offsets[i], i); + } - if(entry.n_info_patterns){ - BYTE info_bitvector_width = ceil((float)entry.n_info_streams/8); - entry.info_bit_vectors = new DataBlockBitvector[entry.n_info_patterns]; - for(U32 i = 0; i < entry.n_info_patterns; ++i){ - buffer >> entry.info_bit_vectors[i]; - entry.info_bit_vectors[i].allocate(info_bitvector_width); - buffer.read((char*)entry.info_bit_vectors[i].bit_bytes, info_bitvector_width); - } + entry.info_patterns = new yon_blk_bv_pair[entry.n_info_patterns]; + for(U32 i = 0; i < entry.n_info_patterns; ++i){ + buffer >> entry.info_patterns[i]; + entry.UpdateInfoPattern(entry.info_patterns[i].pattern, i); + entry.info_patterns[i].Build(entry.n_info_streams, entry.info_map); } - if(entry.n_format_patterns){ - BYTE format_bitvector_width = ceil((float)entry.n_format_streams/8); 
- entry.format_bit_vectors = new DataBlockBitvector[entry.n_format_patterns]; - for(U32 i = 0; i < entry.n_format_patterns; ++i){ - buffer >> entry.format_bit_vectors[i]; - entry.format_bit_vectors[i].allocate(format_bitvector_width); - buffer.read((char*)entry.format_bit_vectors[i].bit_bytes, format_bitvector_width); - } + entry.format_patterns = new yon_blk_bv_pair[entry.n_format_patterns]; + for(U32 i = 0; i < entry.n_format_patterns; ++i){ + buffer >> entry.format_patterns[i]; + entry.UpdateFormatPattern(entry.format_patterns[i].pattern, i); + entry.format_patterns[i].Build(entry.n_format_streams, entry.format_map); } - if(entry.n_filter_patterns){ - BYTE filter_bitvector_width = ceil((float)entry.n_filter_streams/8); - entry.filter_bit_vectors = new DataBlockBitvector[entry.n_filter_patterns]; - for(U32 i = 0; i < entry.n_filter_patterns; ++i){ - buffer >> entry.filter_bit_vectors[i]; - entry.filter_bit_vectors[i].allocate(filter_bitvector_width); - buffer.read((char*)entry.filter_bit_vectors[i].bit_bytes, filter_bitvector_width); - } + entry.filter_patterns = new yon_blk_bv_pair[entry.n_filter_patterns]; + for(U32 i = 0; i < entry.n_filter_patterns; ++i){ + buffer >> entry.filter_patterns[i]; + entry.UpdateFilterPattern(entry.filter_patterns[i].pattern, i); + entry.filter_patterns[i].Build(entry.n_filter_streams, entry.filter_map); } return(buffer); diff --git a/lib/containers/components/variant_block_footer.h b/lib/containers/components/variant_block_footer.h index e628ee8..f8b3dce 100644 --- a/lib/containers/components/variant_block_footer.h +++ b/lib/containers/components/variant_block_footer.h @@ -1,37 +1,199 @@ #ifndef CONTAINERS_COMPONENTS_VARIANT_BLOCK_FOOTER_H_ #define CONTAINERS_COMPONENTS_VARIANT_BLOCK_FOOTER_H_ -#include "data_block_bitvector.h" +#include + #include "data_container_header.h" -#include "containers/hash_container.h" #include "io/basic_buffer.h" -#include "algorithm/OpenHashTable.h" -namespace tachyon{ -namespace containers{ +#include 
"third_party/xxhash/xxhash.h" -struct VariantBlockFooter{ -private: - typedef VariantBlockFooter self_type; - typedef DataBlockBitvector bit_vector; - typedef hash::HashTable hash_table; - typedef std::vector id_vector; - typedef std::vector< id_vector > pattern_vector; - typedef containers::HashContainer hash_container_type; - typedef containers::HashVectorContainer hash_vector_container_type; - typedef DataContainerHeader header_type; +namespace tachyon { + +#define YON_BLK_N_STATIC 25// Total number of invariant headers +#define YON_BLK_PPA 0 // Sample permutation array +#define YON_BLK_CONTIG 1 +#define YON_BLK_POSITION 2 +#define YON_BLK_REFALT 3 +#define YON_BLK_CONTROLLER 4 // Set memberships +#define YON_BLK_QUALITY 5 +#define YON_BLK_NAMES 6 +#define YON_BLK_ALLELES 7 +#define YON_BLK_ID_INFO 8 +#define YON_BLK_ID_FORMAT 9 +#define YON_BLK_ID_FILTER 10 +#define YON_BLK_GT_INT8 11 // Run-length encoded genotypes +#define YON_BLK_GT_INT16 12 +#define YON_BLK_GT_INT32 13 +#define YON_BLK_GT_INT64 14 +#define YON_BLK_GT_S_INT8 15 // Standard encoded genotypes +#define YON_BLK_GT_S_INT16 16 +#define YON_BLK_GT_S_INT32 17 +#define YON_BLK_GT_S_INT64 18 +#define YON_BLK_GT_N_INT8 19 // Standard encoded genotypes +#define YON_BLK_GT_N_INT16 20 +#define YON_BLK_GT_N_INT32 21 +#define YON_BLK_GT_N_INT64 22 +#define YON_BLK_GT_SUPPORT 23 // Genotype support +#define YON_BLK_GT_PLOIDY 24 // Genotype ploidy + +#define YON_BLK_BV_PPA 1 << YON_BLK_PPA +#define YON_BLK_BV_CONTIG 1 << YON_BLK_CONTIG +#define YON_BLK_BV_POSITION 1 << YON_BLK_POSITION +#define YON_BLK_BV_REFALT 1 << YON_BLK_REFALT +#define YON_BLK_BV_CONTROLLER 1 << YON_BLK_CONTROLLER +#define YON_BLK_BV_QUALITY 1 << YON_BLK_QUALITY +#define YON_BLK_BV_NAMES 1 << YON_BLK_NAMES +#define YON_BLK_BV_ALLELES 1 << YON_BLK_ALLELES +#define YON_BLK_BV_ID_INFO 1 << YON_BLK_ID_INFO +#define YON_BLK_BV_ID_FORMAT 1 << YON_BLK_ID_FORMAT +#define YON_BLK_BV_ID_FILTER 1 << YON_BLK_ID_FILTER +#define YON_BLK_BV_GT_INT8 1 
<< YON_BLK_GT_INT8 +#define YON_BLK_BV_GT_INT16 1 << YON_BLK_GT_INT16 +#define YON_BLK_BV_GT_INT32 1 << YON_BLK_GT_INT32 +#define YON_BLK_BV_GT_INT64 1 << YON_BLK_GT_INT64 +#define YON_BLK_BV_GT_S_INT8 1 << YON_BLK_GT_S_INT8 +#define YON_BLK_BV_GT_S_INT16 1 << YON_BLK_GT_S_INT16 +#define YON_BLK_BV_GT_S_INT32 1 << YON_BLK_GT_S_INT32 +#define YON_BLK_BV_GT_S_INT64 1 << YON_BLK_GT_S_INT64 +#define YON_BLK_BV_GT_N_INT8 1 << YON_BLK_GT_N_INT8 +#define YON_BLK_BV_GT_N_INT16 1 << YON_BLK_GT_N_INT16 +#define YON_BLK_BV_GT_N_INT32 1 << YON_BLK_GT_N_INT32 +#define YON_BLK_BV_GT_N_INT64 1 << YON_BLK_GT_N_INT64 +#define YON_BLK_BV_GT_SUPPORT 1 << YON_BLK_GT_SUPPORT +#define YON_BLK_BV_GT_PLOIDY 1 << YON_BLK_GT_PLOIDY + +#define YON_BLK_BV_INFO 1 << (YON_BLK_N_STATIC) +#define YON_BLK_BV_FORMAT 1 << (YON_BLK_N_STATIC + 1) +#define YON_BLK_BV_GT ((YON_BLK_BV_GT_INT8)|(YON_BLK_BV_GT_INT16)|(YON_BLK_BV_GT_INT32)|(YON_BLK_BV_GT_INT64)|(YON_BLK_BV_GT_S_INT8)|(YON_BLK_BV_GT_S_INT16)|(YON_BLK_BV_GT_S_INT32)|(YON_BLK_BV_GT_S_INT64)|(YON_BLK_BV_GT_N_INT8)|(YON_BLK_BV_GT_N_INT16)|(YON_BLK_BV_GT_N_INT32)|(YON_BLK_BV_GT_N_INT64)|(YON_BLK_BV_GT_SUPPORT)|(YON_BLK_BV_GT_PLOIDY)) + +namespace containers { + +struct yon_blk_bv_pair { + yon_blk_bv_pair() : l_bytes(0), bit_bytes(nullptr){} + ~yon_blk_bv_pair(){ delete [] this->bit_bytes; } + + void clear(void){ + this->pattern.clear(); + this->l_bytes = 0; + delete [] this->bit_bytes; + this->bit_bytes = nullptr; + } + + // Bit access + inline bool operator[](const U32 position) const{ return((this->bit_bytes[position / 8] & (1 << (position % 8))) >> (position % 8)); } + + + // Given the total number of fields allocate ceil(n_total_fields/8) + // bytes for the base array. 
+ void Build(const U32 n_total_fields, const std::unordered_map* local_map){ + if(this->pattern.size() == 0) return; + assert(local_map != nullptr); + + // Determine the required width in bytes of the bit-vector + BYTE bitvector_width = ceil((float)(n_total_fields+1)/8); + + // Allocate new bit-vectors + delete [] this->bit_bytes; + this->l_bytes = bitvector_width; + this->bit_bytes = new uint8_t[bitvector_width]; + + // Cycle over pattern size + for(U32 i = 0; i < this->pattern.size(); ++i){ + std::unordered_map::const_iterator it = local_map->find(this->pattern[i]); + assert(it != local_map->end()); + + // Map from absolute key to local key. + U32 local_key = it->second; + assert(local_key <= n_total_fields); + + // Set bit at local key position + this->bit_bytes[local_key/8] |= 1 << (local_key % 8); + } + } + + friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const yon_blk_bv_pair& entry){ + io::SerializePrimitive(entry.l_bytes, buffer); + buffer += (U32)entry.pattern.size(); + for(U32 i = 0; i < entry.pattern.size(); ++i) + io::SerializePrimitive(entry.pattern[i], buffer); + + for(U32 i = 0; i < entry.l_bytes; ++i) + io::SerializePrimitive(entry.bit_bytes[i], buffer); + + + return(buffer); + } + + friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, yon_blk_bv_pair& entry){ + entry.pattern.clear(); + io::DeserializePrimitive(entry.l_bytes, buffer); + U32 l_vector; + buffer >> l_vector; + //entry.pattern.resize(l_vector); + for(U32 i = 0; i < l_vector; ++i){ + int temp; + io::DeserializePrimitive(temp, buffer); + //entry.pattern[i] = temp; + entry.pattern.push_back(temp); + } + + entry.bit_bytes = new BYTE[entry.l_bytes]; + for(U32 i = 0; i < entry.l_bytes; ++i) + io::DeserializePrimitive(entry.bit_bytes[i], buffer); + + return(buffer); + } + + yon_blk_bv_pair& operator=(const yon_blk_bv_pair& other){ + delete [] this->bit_bytes; + this->pattern = other.pattern; + this->l_bytes = other.l_bytes; + this->bit_bytes = new uint8_t[this->l_bytes]; + 
memcpy(this->bit_bytes, other.bit_bytes, this->l_bytes); + return(*this); + } + + yon_blk_bv_pair& operator=(yon_blk_bv_pair&& other) noexcept{ + if (this == &other){ + // take precautions against self-moves + return *this; + } + + delete [] this->bit_bytes; + this->bit_bytes = other.bit_bytes; + other.bit_bytes = nullptr; + this->pattern = std::move(other.pattern); + this->l_bytes = other.l_bytes; + return(*this); + } public: - // Internal use only - enum INDEX_BLOCK_TARGET{INDEX_INFO, INDEX_FORMAT, INDEX_FILTER}; + std::vector pattern; + uint8_t l_bytes; + uint8_t* bit_bytes; +}; + +// It is possible of getting mapping local indices to global IDX +// for either FILTER/FORMAT/INFO fields by iterating over the +// relevant DataContainerHeader structures and tracking the incremental +// position they occur at (local IDX) and read the global IDX in the +// header structure. +struct VariantBlockFooter { +public: + typedef VariantBlockFooter self_type; + typedef DataContainerHeader header_type; + typedef std::unordered_map map_type; + typedef std::unordered_map map_pattern_type; public: VariantBlockFooter(); ~VariantBlockFooter(); void reset(void); + void resetTables(void); // Allocate offset vectors - inline void allocateInfoDiskOffsets(const U32 n_info_streams){ + inline void AllocateInfoHeaders(const U32 n_info_streams){ delete [] this->info_offsets; if(n_info_streams == 0){ this->info_offsets = nullptr; @@ -40,7 +202,7 @@ struct VariantBlockFooter{ this->info_offsets = new header_type[n_info_streams]; } - inline void allocateFormatDiskOffsets(const U32 n_format_streams){ + inline void AllocateFormatHeaders(const U32 n_format_streams){ delete [] this->format_offsets; if(n_format_streams == 0){ this->format_offsets = nullptr; @@ -49,7 +211,7 @@ struct VariantBlockFooter{ this->format_offsets = new header_type[n_format_streams]; } - inline void allocateFilterDiskOffsets(const U32 n_filter_streams){ + inline void AllocateFilterHeaders(const U32 n_filter_streams){ 
delete [] this->filter_offsets; if(n_filter_streams == 0){ this->filter_offsets = nullptr; @@ -65,84 +227,358 @@ struct VariantBlockFooter{ * @param n_format_streams Number of unique format streams * @param n_filter_streams Number of unique filter streams */ - inline void allocateDiskOffsets(const U32 n_info_streams, const U32 n_format_streams, const U32 n_filter_streams){ - this->allocateInfoDiskOffsets(n_info_streams); - this->allocateFormatDiskOffsets(n_format_streams); - this->allocateFilterDiskOffsets(n_filter_streams); + inline void AllocateHeaders(const U32 n_info_streams, + const U32 n_format_streams, + const U32 n_filter_streams) + { + this->AllocateInfoHeaders(n_info_streams); + this->AllocateFormatHeaders(n_format_streams); + this->AllocateFilterHeaders(n_filter_streams); + } + + bool ConstructInfoBitVector(std::unordered_map* pattern_map){ + for(U32 i = 0; i < this->n_info_patterns; ++i){ + this->info_patterns[i].Build(this->n_info_streams, pattern_map); + } + return true; + } + + bool ConstructFormatBitVector(std::unordered_map* pattern_map){ + for(U32 i = 0; i < this->n_format_patterns; ++i){ + this->format_patterns[i].Build(this->n_format_streams, pattern_map); + } + return true; + } + + bool ConstructFilterBitVector(std::unordered_map* pattern_map){ + for(U32 i = 0; i < this->n_filter_patterns; ++i){ + this->filter_patterns[i].Build(this->n_filter_streams, pattern_map); + } + return true; + } + + U32 AddPatternWrapper(const std::vector& pattern, + map_pattern_type* pattern_map, + yon_blk_bv_pair* bv_pairs, + U16& stream_counter) + { + U64 pattern_hash = VariantBlockFooter::HashIdentifiers(pattern); + const map_pattern_type::const_iterator it = pattern_map->find(pattern_hash); // search for pattern + if(it == pattern_map->end()){ + (*pattern_map)[pattern_hash] = stream_counter; + bv_pairs[stream_counter].pattern = pattern; + ++stream_counter; + } + + return((*pattern_map)[pattern_hash]); + } + + U32 AddInfoPattern(const std::vector& pattern){ + 
if(this->info_pattern_map == nullptr) this->BuildPatternMaps(); + if(this->n_info_patterns_allocated == 0){ + delete [] this->info_patterns; + this->info_patterns = new yon_blk_bv_pair[100]; + this->n_info_patterns_allocated = 100; + } + + // Resize if required. + if(this->n_info_patterns == this->n_info_patterns_allocated){ + yon_blk_bv_pair* temp = this->info_patterns; + + this->info_patterns = new yon_blk_bv_pair[this->n_info_patterns_allocated*2]; + for(U32 i = 0; i < this->n_info_patterns_allocated; ++i){ + this->info_patterns[i] = std::move(temp[i]); + } + this->n_info_patterns_allocated *= 2; + delete [] temp; + } + + return(this->AddPatternWrapper(pattern, + this->info_pattern_map, + this->info_patterns, + this->n_info_patterns)); + } + + U32 AddFormatPattern(const std::vector& pattern){ + if(this->format_pattern_map == nullptr) this->BuildPatternMaps(); + if(this->n_format_patterns_allocated == 0){ + delete [] this->format_patterns; + this->format_patterns = new yon_blk_bv_pair[100]; + this->n_format_patterns_allocated = 100; + } + + // Resize if required. + if(this->n_format_patterns == this->n_format_patterns_allocated){ + yon_blk_bv_pair* temp = this->format_patterns; + + this->format_patterns = new yon_blk_bv_pair[this->n_format_patterns_allocated*2]; + for(U32 i = 0; i < this->n_format_patterns_allocated; ++i){ + this->format_patterns[i] = std::move(temp[i]); + } + this->n_format_patterns_allocated *= 2; + delete [] temp; + } + + return(this->AddPatternWrapper(pattern, + this->format_pattern_map, + this->format_patterns, + this->n_format_patterns)); + } + + U32 AddFilterPattern(const std::vector& pattern){ + if(this->filter_pattern_map == nullptr) this->BuildPatternMaps(); + if(this->n_filter_patterns_allocated == 0){ + delete [] this->filter_patterns; + this->filter_patterns = new yon_blk_bv_pair[100]; + this->n_filter_patterns_allocated = 100; + } + + // Resize if required. 
+ if(this->n_filter_patterns == this->n_filter_patterns_allocated){ + yon_blk_bv_pair* temp = this->filter_patterns; + + this->filter_patterns = new yon_blk_bv_pair[this->n_filter_patterns_allocated*2]; + for(U32 i = 0; i < this->n_filter_patterns_allocated; ++i){ + this->filter_patterns[i] = std::move(temp[i]); + } + this->n_filter_patterns_allocated *= 2; + delete [] temp; + } + + return(this->AddPatternWrapper(pattern, + this->filter_pattern_map, + this->filter_patterns, + this->n_filter_patterns)); + } + + // This wrapper adds patterns to the hash map when the data has + // already been loaded. This occurs when loading an object from + // disk/buffer. + U32 UpdatePatternWrapper(const std::vector& pattern, + map_pattern_type* pattern_map, + const U16& stream_counter) + { + U64 pattern_hash = VariantBlockFooter::HashIdentifiers(pattern); + const map_pattern_type::const_iterator it = pattern_map->find(pattern_hash); // search for pattern + if(it == pattern_map->end()) + (*pattern_map)[pattern_hash] = stream_counter; + + return((*pattern_map)[pattern_hash]); + } + + U32 UpdateInfoPattern(const std::vector& pattern, const U16 pattern_id){ + if(this->info_pattern_map == nullptr) this->BuildPatternMaps(); + if(this->n_info_patterns_allocated == 0){ + delete [] this->info_patterns; + this->info_patterns = new yon_blk_bv_pair[100]; + this->n_info_patterns_allocated = 100; + } + return(this->UpdatePatternWrapper(pattern, this->info_pattern_map, pattern_id)); + } + + U32 UpdateFormatPattern(const std::vector& pattern, const U16 pattern_id){ + if(this->format_pattern_map == nullptr) this->BuildPatternMaps(); + if(this->n_format_patterns_allocated == 0){ + delete [] this->format_patterns; + this->format_patterns = new yon_blk_bv_pair[100]; + this->n_format_patterns_allocated = 100; + } + return(this->UpdatePatternWrapper(pattern, this->format_pattern_map, pattern_id)); + } + + U32 UpdateFilterPattern(const std::vector& pattern, const U16 pattern_id){ + 
if(this->filter_pattern_map == nullptr) this->BuildPatternMaps(); + if(this->n_filter_patterns_allocated == 0){ + delete [] this->filter_patterns; + this->filter_patterns = new yon_blk_bv_pair[100]; + this->n_filter_patterns_allocated = 100; + } + return(this->UpdatePatternWrapper(pattern, this->filter_pattern_map, pattern_id)); + } + + void Finalize(void){ + this->ConstructInfoBitVector(this->info_map); + this->ConstructFormatBitVector(this->format_map); + this->ConstructFilterBitVector(this->filter_map); + } + + bool BuildMaps(void){ + delete this->info_map; + delete this->filter_map; + delete this->format_map; + + this->info_map = new map_type(); + this->filter_map = new map_type(); + this->format_map = new map_type(); + + return true; + } + + bool BuildPatternMaps(void){ + delete this->info_pattern_map; + this->info_pattern_map = new map_pattern_type(); + if(this->n_info_patterns_allocated == 0){ + this->info_patterns = new yon_blk_bv_pair[100]; + this->n_info_patterns_allocated = 100; + } + + delete this->filter_pattern_map; + this->filter_pattern_map = new map_pattern_type(); + if(this->n_filter_patterns_allocated == 0){ + this->filter_patterns = new yon_blk_bv_pair[100]; + this->n_filter_patterns_allocated = 100; + } + + delete this->format_pattern_map; + this->format_pattern_map = new map_pattern_type(); + if(this->n_format_patterns_allocated == 0){ + this->format_patterns = new yon_blk_bv_pair[100]; + this->n_format_patterns_allocated = 100; + } + + return true; + } + + U32 UpdateOffsetMapWrapper(const header_type& offset, map_type* map, const U16& stream_counter){ + map_type::const_iterator it = map->find(offset.data_header.global_key); + if(it == map->end()) + (*map)[offset.data_header.global_key] = stream_counter; + + return((*map)[offset.data_header.global_key]); + } + + U32 UpdateInfo(const header_type& offset, const U16 position){ + if(this->info_map == nullptr) this->BuildMaps(); + return(this->UpdateOffsetMapWrapper(offset, this->info_map, 
position)); + } + + U32 UpdateFormat(const header_type& offset, const U16 position){ + if(this->format_map == nullptr) this->BuildMaps(); + return(this->UpdateOffsetMapWrapper(offset, this->format_map, position)); + } + + U32 UpdateFilter(const header_type& offset, const U16 position){ + if(this->filter_map == nullptr) this->BuildMaps(); + return(this->UpdateOffsetMapWrapper(offset, this->filter_map, position)); + } + + U32 AddStreamWrapper(const U32 id, map_type* map, header_type*& offsets, U16& stream_counter){ + map_type::const_iterator it = map->find(id); + if(it == map->end()){ + (*map)[id] = stream_counter; + offsets[stream_counter].data_header.global_key = id; + ++stream_counter; + } + + return((*map)[id]); + } + + U32 AddInfo(const U32 id){ + if(this->info_map == nullptr) this->BuildMaps(); + return(this->AddStreamWrapper(id, this->info_map, this->info_offsets, this->n_info_streams)); + } + + U32 AddFormat(const U32 id){ + if(this->format_map == nullptr) this->BuildMaps(); + return(this->AddStreamWrapper(id, this->format_map, this->format_offsets, this->n_format_streams)); + } + + U32 AddFilter(const U32 id){ + if(this->filter_map == nullptr) this->BuildMaps(); + return(this->AddStreamWrapper(id, this->filter_map, this->filter_offsets, this->n_filter_streams)); } /**< - * Wrapper function for constructing INFO/FORMAT/FILTER pattern - * set-membership bit-vectors - * @param target Target group (INFO/FORMAT/FILTER) - * @param values Hash container of values - * @param patterns Hash container of patterns - * @return Returns TRUE upon success or FALSE otherwise - */ - bool constructBitVector(const INDEX_BLOCK_TARGET& target, hash_container_type& values, hash_vector_container_type& patterns); + * Static function that calculates the 64-bit hash value for the target + * FORMAT/FILTER/INFO vector of id fields. The id fields must be of type + * int (S32). 
Example of using this function: + * + * const U64 hash_value = VariantImporter::HashIdentifiers(id_vector); + * + * @param id_vector Input vector of FORMAT/FILTER/INFO identifiers. + * @return Returns a 64-bit hash value. + */ + static U64 HashIdentifiers(const std::vector& id_vector){ + XXH64_state_t* const state = XXH64_createState(); + if (state==NULL) abort(); -private: - friend std::ostream& operator<<(std::ostream& stream, const self_type& entry); - friend std::ifstream& operator>>(std::ifstream& stream, self_type& entry); - friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const self_type& entry); - friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, self_type& entry); + XXH_errorcode const resetResult = XXH64_reset(state, 71236251); + if (resetResult == XXH_ERROR) abort(); + + for(U32 i = 0; i < id_vector.size(); ++i){ + XXH_errorcode const addResult = XXH64_update(state, (const void*)&id_vector[i], sizeof(int)); + if (addResult == XXH_ERROR) abort(); + } + + U64 hash = XXH64_digest(state); + XXH64_freeState(state); + return hash; + } private: - /**< - * - * @param target - * @param offset - * @param values - * @param patterns - * @return - */ - bool __constructBitVector(bit_vector*& target, header_type* offset, hash_container_type& values, hash_vector_container_type& patterns); + //friend std::ostream& operator<<(std::ostream& stream, const self_type& entry); + //friend std::istream& operator>>(std::ifstream& stream, self_type& entry); + friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const self_type& entry); + friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, self_type& entry); public: - // Not written or read from disk - // Used internally only + // Utility members. l_*_bitvector stores the byte-length + // of each of the bit-vectors described below. + // Not written or read from disk. 
Used internally only BYTE l_info_bitvector; BYTE l_format_bitvector; BYTE l_filter_bitvector; - // Counters - U16 n_info_streams; + // Critical values used to track the number of data streams + // that is available for each possible type. The n_*_streams + // fields corresponds to the number of different byte streams + // that are set in this block. The n_*_patterns corresponds + // to the number of unique vectors of field identifiers that + // occurred in the block. + U16 n_info_streams; // streams U16 n_format_streams; U16 n_filter_streams; - U16 n_info_patterns; + U16 n_info_patterns; // patterns U16 n_format_patterns; U16 n_filter_patterns; - // Headers of the various containers - header_type offset_ppa; - header_type offset_meta_contig; - header_type offset_meta_position; - header_type offset_meta_refalt; - header_type offset_meta_controllers; - header_type offset_meta_quality; - header_type offset_meta_names; - header_type offset_meta_alleles; - header_type offset_meta_info_id; - header_type offset_meta_format_id; - header_type offset_meta_filter_id; - header_type offset_gt_8b; - header_type offset_gt_16b; - header_type offset_gt_32b; - header_type offset_gt_64b; - header_type offset_gt_simple8; - header_type offset_gt_simple16; - header_type offset_gt_simple32; - header_type offset_gt_simple64; - header_type offset_gt_helper; + // Header structures corresponds critical information regarding + // the global IDX and virtual byte offset to the start of each + // compressed and possibly encrypted byte stream. In addition, + // this structure details the primitive type of the data in the + // stream and its stride size (consecutive elements / entry) for + // both the data itself and the stride themselves. + // Note that only INFO/FORMAT/FILTER fields have IDX fields. The + // other fields do not require dictionary lookup to ascertain + // their identity as they are guaranteed to be invariant. 
+ header_type* offsets; header_type* info_offsets; header_type* format_offsets; header_type* filter_offsets; - // Bit vectors - bit_vector* info_bit_vectors; - bit_vector* format_bit_vectors; - bit_vector* filter_bit_vectors; + // Bit-vectors of INFO/FORMAT/FILTER vectors of local IDX + // patterns. These bit-vectors are used to quickly check + // for the set membership of a given global and/or local IDX. + // The bit-vectors internally holds the actual vector of IDX + // for internal use. Construction of these bit-vectors are not + // critical for basic functionality but critical for the + // restoration of a bit-exact output sequence of fields. + U32 n_info_patterns_allocated; + U32 n_format_patterns_allocated; + U32 n_filter_patterns_allocated; + yon_blk_bv_pair* info_patterns; + yon_blk_bv_pair* format_patterns; + yon_blk_bv_pair* filter_patterns; + + // Supportive hash tables to permit the map from global + // IDX fields to local IDX fields. + map_type* info_map; + map_type* format_map; + map_type* filter_map; + map_pattern_type* info_pattern_map; + map_pattern_type* format_pattern_map; + map_pattern_type* filter_pattern_map; }; } diff --git a/lib/containers/components/variant_block_header.cpp b/lib/containers/components/variant_block_header.cpp index 3cb56cc..5df287c 100644 --- a/lib/containers/components/variant_block_header.cpp +++ b/lib/containers/components/variant_block_header.cpp @@ -9,7 +9,7 @@ namespace containers{ VariantBlockHeader::VariantBlockHeader() : l_offset_footer(0), - blockID(0), + block_hash(0), contigID(-1), minPosition(0), maxPosition(0), diff --git a/lib/containers/components/variant_block_header.h b/lib/containers/components/variant_block_header.h index ae5b8fa..eefd862 100644 --- a/lib/containers/components/variant_block_header.h +++ b/lib/containers/components/variant_block_header.h @@ -4,9 +4,7 @@ #include #include -#include "data_block_bitvector.h" #include "data_container_header.h" -#include "containers/hash_container.h" #include 
"io/basic_buffer.h" #include "data_container_header_controller.h" @@ -76,12 +74,12 @@ struct VariantBlockHeader{ inline const S32& getContigID(void) const{ return(this->contigID); } inline const S64& getMinPosition(void) const{ return(this->minPosition); } inline const S64& getMaxPosition(void) const{ return(this->maxPosition); } - inline U64& getBlockID(void){ return(this->blockID); } - inline const U64& getBlockID(void) const{ return(this->blockID); } + inline U64& getBlockID(void){ return(this->block_hash); } + inline const U64& getBlockID(void) const{ return(this->block_hash); } friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ stream.write(reinterpret_cast(&entry.l_offset_footer), sizeof(U32)); - stream.write(reinterpret_cast(&entry.blockID), sizeof(U64)); + stream.write(reinterpret_cast(&entry.block_hash), sizeof(U64)); stream << entry.controller; stream.write(reinterpret_cast(&entry.contigID), sizeof(U32)); stream.write(reinterpret_cast(&entry.minPosition), sizeof(S64)); @@ -93,7 +91,7 @@ struct VariantBlockHeader{ friend std::ifstream& operator>>(std::ifstream& stream, self_type& entry){ stream.read(reinterpret_cast(&entry.l_offset_footer), sizeof(U32)); - stream.read(reinterpret_cast(&entry.blockID), sizeof(U64)); + stream.read(reinterpret_cast(&entry.block_hash), sizeof(U64)); stream >> entry.controller; stream.read(reinterpret_cast(&entry.contigID), sizeof(U32)); stream.read(reinterpret_cast(&entry.minPosition), sizeof(S64)); @@ -105,7 +103,7 @@ struct VariantBlockHeader{ void reset(void){ this->l_offset_footer = 0; - this->blockID = 0; + this->block_hash = 0; this->controller.clear(); this->contigID = -1; this->minPosition = 0; @@ -114,11 +112,10 @@ struct VariantBlockHeader{ } public: - // allows jumping to the next block when streaming - // over the file and not using the index + // Allows jumping to the next block when streaming. 
// EOF marker is at this position - sizeof(EOF marker) U32 l_offset_footer; - U64 blockID; // block identifier in the form of a random hash + U64 block_hash; // block identifier in the form of a random hash controller_type controller; S32 contigID; // contig identifier S64 minPosition; // minimum coordinate in this block diff --git a/lib/containers/components/variant_block_mapper_entry.h b/lib/containers/components/variant_block_mapper_entry.h deleted file mode 100644 index bdb65d0..0000000 --- a/lib/containers/components/variant_block_mapper_entry.h +++ /dev/null @@ -1,86 +0,0 @@ -#ifndef CONTAINERS_COMPONENTS_VARIANT_BLOCK_MAPPER_ENTRY_H_ -#define CONTAINERS_COMPONENTS_VARIANT_BLOCK_MAPPER_ENTRY_H_ - -#include "data_container_header.h" - -namespace tachyon{ -namespace containers{ - -struct VariantBlockMapperEntry { -public: - typedef VariantBlockMapperEntry self_type; - typedef containers::DataContainerHeader header_type; - -public: - VariantBlockMapperEntry() : - load_order_index(-1), - stream_id_local(-1), - stream_id_global(-1), - offset(nullptr) - {} - - VariantBlockMapperEntry(const U32 load_order_index, const S32 target_stream_disk, const header_type* offset) : - load_order_index(load_order_index), - stream_id_local(target_stream_disk), - stream_id_global(offset->data_header.global_key), - offset(offset) - {} - - ~VariantBlockMapperEntry(){} - - VariantBlockMapperEntry(const VariantBlockMapperEntry& other) : - load_order_index(other.load_order_index), - stream_id_local(other.stream_id_local), - stream_id_global(other.stream_id_global), - offset(other.offset) - {} - - VariantBlockMapperEntry(VariantBlockMapperEntry&& other) : - load_order_index(other.load_order_index), - stream_id_local(other.stream_id_local), - stream_id_global(other.stream_id_global), - offset(other.offset) - {} - - VariantBlockMapperEntry& operator=(const VariantBlockMapperEntry& other){ - this->load_order_index = other.load_order_index; - this->stream_id_local = other.stream_id_local; - 
this->stream_id_global = other.stream_id_global; - this->offset = other.offset; - return *this; - } - - VariantBlockMapperEntry& operator=(VariantBlockMapperEntry&& other){ - if(this!=&other) // prevent self-move - { - this->load_order_index = other.load_order_index; - this->stream_id_local = other.stream_id_local; - this->stream_id_global = other.stream_id_global; - this->offset = other.offset; - } - return *this; - } - - inline bool operator<(const self_type& other) const{ return(this->offset->data_header.offset < other.offset->data_header.offset); } - inline bool operator>(const self_type& other) const{ return(!((*this) < other)); } - - void operator()(const U32& load_order_index, const U32& stream_id_local, const S32& stream_id_global, const header_type* offset){ - this->load_order_index = load_order_index; - this->stream_id_local = stream_id_local; - this->stream_id_global = stream_id_global; - this->offset = offset; - } - -public: - S32 load_order_index; // Loaded order index - S32 stream_id_local; // Local target index - S32 stream_id_global; // Global target index - const header_type* offset; // Header object of target data container -}; - -} -} - - - -#endif /* CONTAINERS_COMPONENTS_VARIANT_BLOCK_MAPPER_ENTRY_H_ */ diff --git a/lib/containers/data_container.cpp b/lib/containers/data_container.cpp index 2520965..d62c3dc 100644 --- a/lib/containers/data_container.cpp +++ b/lib/containers/data_container.cpp @@ -1,12 +1,11 @@ #include #include "data_container.h" -#include "algorithm/OpenHashTable.h" #include "io/basic_buffer.h" #include "support/helpers.h" #include "support/type_definitions.h" -#include "third_party/zlib/zconf.h" -#include "third_party/zlib/zlib.h" + +#include "algorithm/digest/variant_digest_manager.h" namespace tachyon{ namespace containers{ @@ -38,69 +37,49 @@ void DataContainer::resize(const U32 size){ this->buffer_strides_uncompressed.resize(size); } -void DataContainer::generateCRC(void){ - if(this->buffer_data_uncompressed.size() == 0){ 
- this->header.data_header.crc = 0; - } else { - // Checksum for main buffer - U32 crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)this->buffer_data_uncompressed.buffer, this->buffer_data_uncompressed.size()); - this->header.data_header.crc = crc; - } - - if(this->header.data_header.hasMixedStride()){ - if(this->buffer_data_uncompressed.size() == 0){ - this->header.stride_header.crc = 0; - } else { - // Checksum for strides - U32 crc = crc32(0, NULL, 0); - if(this->buffer_strides_uncompressed.size() > 0){ - crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)this->buffer_strides_uncompressed.buffer, this->buffer_strides_uncompressed.size()); - this->header.stride_header.crc = crc; - } - } - } +void DataContainer::GenerateMd5(void){ + algorithm::VariantDigestManager::GenerateMd5(this->buffer_data_uncompressed.data(), this->buffer_data_uncompressed.size(), &this->header.data_header.crc[0]); + algorithm::VariantDigestManager::GenerateMd5(this->buffer_strides_uncompressed.data(), this->buffer_strides_uncompressed.size(), &this->header.stride_header.crc[0]); } -bool DataContainer::checkCRC(int target){ +bool DataContainer::CheckMd5(int target){ if(target == 0){ if(this->buffer_data_uncompressed.size() == 0) return true; // Checksum for main buffer - U32 crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)this->buffer_data_uncompressed.buffer, this->buffer_data_uncompressed.size()); - return(crc == this->header.data_header.crc); + uint8_t md5_compare[MD5_DIGEST_LENGTH]; + algorithm::VariantDigestManager::GenerateMd5(this->buffer_data_uncompressed.data(), this->buffer_data_uncompressed.size(), &md5_compare[0]); + return(this->header.data_header.CheckChecksum(md5_compare)); + } else if(target == 1){ if(this->buffer_strides_uncompressed.size() == 0) return true; - // Checksum for main buffer - U32 crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)this->buffer_strides_uncompressed.buffer, this->buffer_strides_uncompressed.size()); - return(crc == 
this->header.stride_header.crc); + uint8_t md5_compare[MD5_DIGEST_LENGTH]; + algorithm::VariantDigestManager::GenerateMd5(this->buffer_strides_uncompressed.data(), this->buffer_strides_uncompressed.size(), &md5_compare[0]); + return(this->header.stride_header.CheckChecksum(md5_compare)); + } else if(target == 3){ if(this->buffer_data.size() == 0) return true; - // Checksum for main buffer - U32 crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)this->buffer_data.buffer, this->buffer_data.size()); - return(crc == this->header.data_header.crc); + uint8_t md5_compare[MD5_DIGEST_LENGTH]; + algorithm::VariantDigestManager::GenerateMd5(this->buffer_data.data(), this->buffer_data.size(), &md5_compare[0]); + return(this->header.data_header.CheckChecksum(md5_compare)); + } else if(target == 4){ if(this->buffer_strides.size() == 0) return true; - // Checksum for main buffer - U32 crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)this->buffer_strides.buffer, this->buffer_strides.size()); - return(crc == this->header.stride_header.crc); + uint8_t md5_compare[MD5_DIGEST_LENGTH]; + algorithm::VariantDigestManager::GenerateMd5(this->buffer_strides.data(), this->buffer_strides.size(), &md5_compare[0]); + return(this->header.stride_header.CheckChecksum(md5_compare)); } return true; } -bool DataContainer::checkUniformity(void){ +bool DataContainer::CheckUniformity(void){ if(this->header.n_entries == 0) return false; @@ -146,58 +125,66 @@ bool DataContainer::checkUniformity(void){ return(true); } -void DataContainer::reformatInteger(){ +void DataContainer::ReformatInteger(){ if(this->buffer_data_uncompressed.size() == 0) return; - // Recode integer types only - if(!(this->header.data_header.controller.type == YON_TYPE_32B && this->header.data_header.controller.signedness == 1)){ + // Recode integer types only. 
+ if(!(this->header.data_header.controller.type == YON_TYPE_32B && + this->header.data_header.controller.signedness == true)) + { return; } - // Do not recode if the data is uniform - if(this->header.data_header.isUniform()) + // Do not recode if the data is uniform. + if(this->header.data_header.IsUniform()) return; - // At this point all integers are S32 + // At this point all integers are signed 32-bit integers. + assert(this->buffer_data_uncompressed.size() % sizeof(S32) == 0); + assert(this->header.n_additions * sizeof(S32) == this->buffer_data_uncompressed.size()); const S32* const dat = reinterpret_cast(this->buffer_data_uncompressed.data()); - const U32* const udat = reinterpret_cast(this->buffer_data_uncompressed.data()); - S32 min = dat[0]; - S32 max = dat[0]; - bool hasMissing = false; + S32 min_value = dat[0]; + S32 max_value = dat[0]; + bool has_special = false; + // Iterate over available data and search for either missingness + // or sentinel node values. If a match is found trigger the + // bool flag to signal the use of the recoding procedure. for(U32 j = 0; j < this->header.n_additions; ++j){ - if(udat[j] == 0x80000000 || udat[j] == 0x80000001){ - hasMissing = true; + if(dat[j] == bcf_int32_missing || dat[j] == bcf_int32_vector_end){ + has_special = true; continue; } - if(dat[j] < min) min = dat[j]; - if(dat[j] > max) max = dat[j]; + if(dat[j] < min_value) min_value = dat[j]; + if(dat[j] > max_value) max_value = dat[j]; } + // If we have missing values then we have to use signed + // primitives to accommodate this fact. 
BYTE byte_width = 0; - // If we have missing values then we have to use signedness - if(min < 0 || hasMissing == true){ - byte_width = ceil((ceil(log2(abs(min) + 1 + 2)) + 1) / 8); // One bit is used for sign, 2 values for missing and end-of-vector - const BYTE byte_width_max = ceil((ceil(log2(abs(max) + 1 + 2)) + 1) / 8); + if(min_value < 0 || has_special == true){ + byte_width = ceil((ceil(log2(abs(min_value) + 1 + 2)) + 1) / 8); // One bit is used for sign, 2 values for missing and end-of-vector + const BYTE byte_width_max = ceil((ceil(log2(abs(max_value) + 1 + 2)) + 1) / 8); if(byte_width_max > byte_width){ byte_width = byte_width_max; } } - else byte_width = ceil(ceil(log2(max + 1)) / 8); + else byte_width = ceil(ceil(log2(max_value + 1)) / 8); + // Select the smallest primitive type (word width) that + // can hold the target data range. if(byte_width == 0) byte_width = 1; else if(byte_width >= 3 && byte_width <= 4) byte_width = 4; else if(byte_width > 4) byte_width = 8; - // Phase 2 - // Here we re-encode values using the smallest possible word-size + // Setup buffers. 
this->buffer_data.reset(); this->buffer_data.resize(this->buffer_data_uncompressed.size() + 65536); // Is non-negative // Also cannot have missing values - if(min >= 0 && hasMissing == false){ + if(min_value >= 0 && has_special == false){ this->header.data_header.controller.signedness = 0; if(byte_width == 1){ @@ -232,34 +219,40 @@ void DataContainer::reformatInteger(){ exit(1); } } - // Is negative or has missing + // Is negative or Has missing else { this->header.data_header.controller.signedness = true; if(byte_width == 1){ this->header.data_header.controller.type = YON_TYPE_8B; + const SBYTE missing = INT8_MIN; + const SBYTE eov = INT8_MIN + 1; for(U32 j = 0; j < this->header.n_additions; ++j){ - if(udat[j] == 0x80000000) this->buffer_data += (BYTE)0x80; - else if(udat[j] == 0x80000001) this->buffer_data += (BYTE)0x81; + if(dat[j] == bcf_int32_missing) this->buffer_data += missing; + else if(dat[j] == bcf_int32_vector_end) this->buffer_data += eov; else this->buffer_data += (SBYTE)dat[j]; } } else if(byte_width == 2){ this->header.data_header.controller.type = YON_TYPE_16B; + const S16 missing = INT16_MIN; + const S16 eov = INT16_MIN + 1; for(U32 j = 0; j < this->header.n_additions; ++j){ - if(udat[j] == 0x80000000) this->buffer_data += (U16)0x8000; - else if(udat[j] == 0x80000001) this->buffer_data += (U16)0x8001; + if(dat[j] == bcf_int32_missing) this->buffer_data += missing; + else if(dat[j] == bcf_int32_vector_end) this->buffer_data += eov; else this->buffer_data += (S16)dat[j]; } } else if(byte_width == 4){ this->header.data_header.controller.type = YON_TYPE_32B; + const S32 missing = INT32_MIN; + const S32 eov = INT32_MIN + 1; for(U32 j = 0; j < this->header.n_additions; ++j){ - if(udat[j] == 0x80000000) this->buffer_data += (U32)0x80000000; - else if(udat[j] == 0x80000001) this->buffer_data += (U32)0x80000001; + if(dat[j] == bcf_int32_missing) this->buffer_data += missing; + else if(dat[j] == bcf_int32_vector_end) this->buffer_data += eov; else 
this->buffer_data += (S32)dat[j]; } @@ -268,6 +261,8 @@ void DataContainer::reformatInteger(){ exit(1); } } + assert(this->buffer_data.size() % byte_width == 0); + assert(this->header.n_additions * byte_width == this->buffer_data.size()); memcpy(this->buffer_data_uncompressed.buffer, this->buffer_data.buffer, this->buffer_data.size()); this->buffer_data_uncompressed.n_chars = this->buffer_data.size(); @@ -275,11 +270,11 @@ void DataContainer::reformatInteger(){ this->buffer_data.reset(); } -void DataContainer::reformatStride(){ +void DataContainer::ReformatStride(){ if(this->buffer_strides_uncompressed.size() == 0) return; - if(this->header.data_header.hasMixedStride() == false) + if(this->header.data_header.HasMixedStride() == false) return; // Recode integer types @@ -351,79 +346,28 @@ void DataContainer::reformatStride(){ this->buffer_strides.reset(); } -const U32 DataContainer::getObjectSize(void) const{ +U32 DataContainer::GetObjectSize(void) const{ // In case data is encrypted if(this->header.data_header.controller.encryption != YON_ENCRYPTION_NONE) return(this->buffer_data.size()); U32 total_size = this->buffer_data.size(); - if(this->header.data_header.hasMixedStride()) + if(this->header.data_header.HasMixedStride()) total_size += this->buffer_strides.size(); return(total_size); } -const U64 DataContainer::getObjectSizeUncompressed(void) const{ +U64 DataContainer::GetObjectSizeUncompressed(void) const{ U64 total_size = this->buffer_data_uncompressed.size(); - if(this->header.data_header.hasMixedStride()) + if(this->header.data_header.HasMixedStride()) total_size += this->buffer_strides_uncompressed.size(); return(total_size); } -void DataContainer::deltaEncode(){ - if(this->size() == 0) - return; - - // Recode integer types only - if(!(this->header.data_header.controller.type == YON_TYPE_32B && this->header.data_header.controller.signedness == 1)){ - return; - } - - if(this->header.data_header.controller.uniform == true) - return; - - // At this point all 
integers are S32 - const S32* const dat = reinterpret_cast(this->buffer_data_uncompressed.buffer); - - // check for uniformity except first - if(this->header.n_additions > 1){ - bool is_uniform_delta = true; - const S32 test_diff = dat[1] - dat[0]; - for(U32 i = 2; i < this->header.n_additions; ++i){ - if(dat[i] - dat[i - 1] != test_diff){ - is_uniform_delta = false; - break; - } - } - - if(is_uniform_delta){ - this->header.n_entries = 1; - this->header.n_additions = 1; - // Data pointers are updated in case there is no reformatting - // see StreamContainer::reformat() - this->buffer_data_uncompressed.n_chars = sizeof(S32); - this->header.data_header.uLength = sizeof(S32); - this->header.data_header.cLength = sizeof(S32); - this->header.data_header.controller.uniform = true; - this->header.data_header.controller.mixedStride = false; - this->header.data_header.controller.encoder = YON_ENCODE_NONE; - return; - } - } - - this->buffer_data += dat[0]; - for(U32 j = 1; j < this->header.n_additions; ++j){ - this->buffer_data += dat[j] - dat[j-1]; - } - memcpy(this->buffer_data_uncompressed.data(), - this->buffer_data.data(), - this->buffer_data.size()); - -} - -void DataContainer::updateContainer(bool reformat){ - // If the data container has entries in it but has +void DataContainer::UpdateContainer(bool reformat_data, bool reformat_stride){ + // If the data container Has entries in it but Has // no actual data then it is a BOOLEAN if(this->header.n_entries && this->buffer_data_uncompressed.size() == 0){ this->header.reset(); @@ -444,21 +388,280 @@ void DataContainer::updateContainer(bool reformat){ // Check if stream is uniform in content if(this->header.data_header.controller.type != YON_TYPE_STRUCT){ - this->checkUniformity(); + this->CheckUniformity(); // Reformat stream to use as small word size as possible - if(reformat) this->reformatInteger(); + if(reformat_data) this->ReformatInteger(); } // Set uncompressed length this->header.data_header.uLength = 
this->buffer_data_uncompressed.size(); // If we have mixed striding - if(this->header.data_header.hasMixedStride()){ + if(this->header.data_header.HasMixedStride()){ // Reformat stream to use as small word size as possible - if(reformat) this->reformatStride(); + if(reformat_stride) this->ReformatStride(); this->header.stride_header.uLength = this->buffer_strides_uncompressed.size(); } } +void DataContainer::AddStride(const U32 value){ + // If this is the first stride set + if(this->header.n_strides == 0){ + this->header.stride_header.controller.type = YON_TYPE_32B; + this->header.stride_header.controller.signedness = false; + this->SetStrideSize(value); + } + + // Check if there are different strides + if(!this->CheckStrideSize(value)){ + this->TriggerMixedStride(); + } + + // Add value + this->buffer_strides_uncompressed += (U32)value; + ++this->header.n_strides; +} + +bool DataContainer::Add(const BYTE& value){ + if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ + this->header.data_header.SetType(YON_TYPE_32B); + this->header.data_header.controller.signedness = false; + } + if(!this->CheckInteger()) { + std::cerr << "Primitive type -> local: " << (int)this->header.data_header.controller.type << " added BYTE" << std::endl; + exit(1); + return false; + } + this->buffer_data_uncompressed += (S32)value; + ++this->header.n_additions; + return(true); +} + +bool DataContainer::Add(const U16& value){ + if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ + this->header.data_header.SetType(YON_TYPE_32B); + this->header.data_header.controller.signedness = false; + } + if(!this->CheckInteger()) { + std::cerr << "Primitive type -> local: " << (int)this->header.data_header.controller.type << " added U16" << std::endl; + exit(1); + return false; + } + this->buffer_data_uncompressed += (S32)value; + ++this->header.n_additions; + return(true); +} + +bool DataContainer::Add(const U32& 
value){ + if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ + this->header.data_header.SetType(YON_TYPE_32B); + this->header.data_header.controller.signedness = false; + } + if(!this->CheckInteger()){ + std::cerr << "Primitive type -> local: " << (int)this->header.data_header.controller.type << " added U32" << std::endl; + exit(1); + return false; + } + this->buffer_data_uncompressed += (S32)value; + ++this->header.n_additions; + return(true); +} + +bool DataContainer::Add(const SBYTE& value){ + if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ + this->header.data_header.SetType(YON_TYPE_32B); + this->header.data_header.controller.signedness = false; + } + + if(!this->CheckInteger()) { + std::cerr << "Primitive type -> local: " << (int)this->header.data_header.controller.type << " added SBYTE" << std::endl; + exit(1); + return false; + } + + if(value == bcf_int8_vector_end){ + this->buffer_data_uncompressed += (S32)bcf_int32_vector_end; + ++this->header.n_additions; + //std::cerr << "value is int8eov" << std::endl; + return(true); + } + + if(value == bcf_int8_missing){ + this->buffer_data_uncompressed += (S32)bcf_int32_missing; + ++this->header.n_additions; + //std::cerr << "value is int8miss" << std::endl; + return(true); + } + + this->buffer_data_uncompressed += (S32)value; + ++this->header.n_additions; + return(true); +} + +bool DataContainer::Add(const S16& value){ + if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ + this->header.data_header.SetType(YON_TYPE_32B); + this->header.data_header.controller.signedness = false; + } + if(!this->CheckInteger()) { + std::cerr << "Primitive type -> local: " << (int)this->header.data_header.controller.type << " added S16" << std::endl; + exit(1); + return false; + } + + if(value == bcf_int16_vector_end){ + this->buffer_data_uncompressed += (S32)bcf_int32_vector_end; + 
++this->header.n_additions; + //std::cerr << "value is int16eov" << std::endl; + return(true); + } + + if(value == bcf_int16_missing){ + this->buffer_data_uncompressed += (S32)bcf_int32_missing; + ++this->header.n_additions; + //std::cerr << "value is int16miss" << std::endl; + return(true); + } + + this->buffer_data_uncompressed += (S32)value; + ++this->header.n_additions; + return(true); +} + +bool DataContainer::Add(const S32& value){ + if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ + this->header.data_header.SetType(YON_TYPE_32B); + this->header.data_header.controller.signedness = false; + } + if(!this->CheckInteger()) { + std::cerr << "Primitive type -> local: " << (int)this->header.data_header.controller.type << " added S32" << std::endl; + exit(1); + return false; + } + + if(value == bcf_int32_vector_end){ + this->buffer_data_uncompressed += (S32)bcf_int32_vector_end; + ++this->header.n_additions; + //std::cerr << "value is int32eov" << std::endl; + return(true); + } + + if(value == bcf_int32_missing){ + this->buffer_data_uncompressed += (S32)bcf_int32_missing; + ++this->header.n_additions; + //std::cerr << "value is int32miss" << std::endl; + return(true); + } + + this->buffer_data_uncompressed += (S32)value; + ++this->header.n_additions; + return(true); +} + +bool DataContainer::Add(const U64& value){ + if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ + this->header.data_header.SetType(YON_TYPE_64B); + this->header.data_header.controller.signedness = false; + } + + // Make checks + if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_64B, false)) { + std::cerr << "Primitive type -> local: " << (int)this->header.data_header.controller.type << " added U64" << std::endl; + return false; + } + + this->buffer_data_uncompressed += (U64)value; + ++this->header.n_additions; + return(true); +} + +bool DataContainer::Add(const S64& value){ + 
if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ + this->header.data_header.SetType(YON_TYPE_64B); + this->header.data_header.controller.signedness = true; + } + + + // Make checks + if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_64B, true)) { + std::cerr << "Primitive type -> local: " << (int)this->header.data_header.controller.type << " added S64" << std::endl; + return false; + } + + this->buffer_data_uncompressed += (U64)value; + ++this->header.n_additions; + //++this->n_entries; + return(true); +} + +bool DataContainer::Add(const float& value){ + if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ + this->header.data_header.SetType(YON_TYPE_FLOAT); + this->header.data_header.controller.signedness = true; + } + + // Make checks + if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_FLOAT, true)) { + std::cerr << "Primitive type -> local: " << (int)this->header.data_header.controller.type << " added FLOAT" << std::endl; + return false; + } + + this->buffer_data_uncompressed += (float)value; + ++this->header.n_additions; + return(true); +} + +bool DataContainer::Add(const double& value){ + if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ + this->header.data_header.SetType(YON_TYPE_DOUBLE); + this->header.data_header.controller.signedness = true; + } + + // Make checks + if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_DOUBLE, true)) { + std::cerr << "Primitive type -> local: " << (int)this->header.data_header.controller.type << " added DOUBLE" << std::endl; + return false; + } + + this->buffer_data_uncompressed += (double)value; + ++this->header.n_additions; + return(true); +} + +bool DataContainer::AddCharacter(const char& value){ + if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ + 
this->header.data_header.SetType(YON_TYPE_CHAR); + this->header.data_header.controller.signedness = true; + } + + // Make checks + if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_CHAR, true)) { + std::cerr << "Primitive type -> local: " << (int)this->header.data_header.controller.type << " added CHAR" << std::endl; + return false; + } + + this->buffer_data_uncompressed += (char)value; + ++this->header.n_additions; + return(true); +} + +bool DataContainer::AddCharacter(const char* const string, const U32 l_string){ + if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ + this->header.data_header.SetType(YON_TYPE_CHAR); + this->header.data_header.controller.signedness = true; + //std::cerr << "triggering: string" << std::endl; + } + + // Make checks + if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_CHAR, true)) { + std::cerr << "Primitive type -> local: " << (int)this->header.data_header.controller.type << " added CHAR" << std::endl; + return false; + } + + this->buffer_data_uncompressed.Add(string, l_string); + this->header.n_additions += l_string; + return(true); +} + } } diff --git a/lib/containers/data_container.h b/lib/containers/data_container.h index 67ef4b7..961cf3f 100644 --- a/lib/containers/data_container.h +++ b/lib/containers/data_container.h @@ -31,7 +31,7 @@ class DataContainer{ * the objects in this container. * @param value Data primitive type */ - inline void setType(const TACHYON_CORE_TYPE value){ this->header.data_header.controller.type = value; } + inline void SetType(const TACHYON_CORE_TYPE value){ this->header.data_header.controller.type = value; } /**< * Set the stride size of this container to some value. 
@@ -40,7 +40,11 @@ class DataContainer{ * [0,..inf) * @param value Stride size */ - inline void setStrideSize(const S32 value){ this->header.data_header.stride = value; } + inline void SetStrideSize(const S32 value){ this->header.data_header.stride = value; } + + inline TACHYON_CORE_TYPE GetDataPrimitiveType(void) const{ return(TACHYON_CORE_TYPE(this->header.data_header.controller.type)); } + inline TACHYON_CORE_TYPE GetStridePrimitiveType(void) const{ return(TACHYON_CORE_TYPE(this->header.stride_header.controller.type)); } + /**< * Check if the stride size of this container matches the @@ -49,8 +53,8 @@ class DataContainer{ * @param value Stride size to compare against * @return Returns TRUE if they are the same or FALSE otherwise */ - inline const bool checkStrideSize(const S32 value) const{ - if(this->header.data_header.hasMixedStride() == false) + inline bool CheckStrideSize(const S32 value) const{ + if(this->header.data_header.HasMixedStride() == false) return false; return(this->header.data_header.stride == value); @@ -60,7 +64,7 @@ class DataContainer{ * Triggers the necessary switches to set this container * as having mixed strides */ - inline void triggerMixedStride(void){ + inline void TriggerMixedStride(void){ this->header.data_header.stride = -1; this->header.data_header.controller.mixedStride = true; } @@ -79,6 +83,11 @@ class DataContainer{ return(*this); } + // Supportive + inline const U64& GetSizeUncompressed(void) const{ return(this->buffer_data_uncompressed.size()); } + inline const U64& GetSizeCompressed(void) const{ return(this->buffer_data.size()); } + inline const U32& size(void) const{ return(this->header.n_entries); } + /**< * Adds a stride value to the uncompressed buffer. At this * point all stride values added must be of type U32. This @@ -86,186 +95,25 @@ class DataContainer{ * not. 
* @param value Stride value to add */ - inline void addStride(const U32 value){ - // If this is the first stride set - if(this->header.n_strides == 0){ - this->header.stride_header.controller.type = YON_TYPE_32B; - this->header.stride_header.controller.signedness = false; - this->setStrideSize(value); - } - - // Check if there are different strides - if(!this->checkStrideSize(value)){ - this->triggerMixedStride(); - } - - // Add value - this->buffer_strides_uncompressed += (U32)value; - ++this->header.n_strides; - } - - // Supportive - inline const U64& getSizeUncompressed(void) const{ return(this->buffer_data_uncompressed.size()); } - inline const U64& getSizeCompressed(void) const{ return(this->buffer_data.size()); } - inline const U32& size(void) const{ return(this->header.n_entries); } - - inline bool Add(const BYTE& value){ - if(!this->__checkInteger()) return false; - this->buffer_data_uncompressed += (S32)value; - ++this->header.n_additions; - return(true); - } - - inline bool Add(const U16& value){ - if(!this->__checkInteger()) return false; - this->buffer_data_uncompressed += (S32)value; - ++this->header.n_additions; - return(true); - } - - inline bool Add(const U32& value){ - if(!this->__checkInteger()) return false; - this->buffer_data_uncompressed += (S32)value; - ++this->header.n_additions; - return(true); - } - - inline bool Add(const SBYTE& value){ - if(!this->__checkInteger()) return false; - this->buffer_data_uncompressed += (S32)value; - ++this->header.n_additions; - return(true); - } - - inline bool Add(const S16& value){ - if(!this->__checkInteger()) return false; - this->buffer_data_uncompressed += (S32)value; - ++this->header.n_additions; - return(true); - } - - inline bool Add(const S32& value){ - if(!this->__checkInteger()) return false; - this->buffer_data_uncompressed += (S32)value; - ++this->header.n_additions; - return(true); - } - - inline bool Add(const U64& value){ - if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && 
this->header.n_entries == 0){ - this->header.data_header.setType(YON_TYPE_64B); - this->header.data_header.controller.signedness = false; - } - - // Make checks - if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_64B, false)){ - std::cerr << "Illegal primitive type match u64!" << std::endl; - exit(1); - return false; - } - - this->buffer_data_uncompressed += (U64)value; - ++this->header.n_additions; - return(true); - } - - inline bool Add(const S64& value){ - if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ - this->header.data_header.setType(YON_TYPE_64B); - this->header.data_header.controller.signedness = true; - } - - - // Make checks - if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_64B, true)){ - std::cerr << "Illegal primitive type match s64!" << std::endl; - exit(1); - return false; - } - - this->buffer_data_uncompressed += (U64)value; - ++this->header.n_additions; - //++this->n_entries; - return(true); - } - - inline bool Add(const float& value){ - if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ - this->header.data_header.setType(YON_TYPE_FLOAT); - this->header.data_header.controller.signedness = true; - } - - // Make checks - if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_FLOAT, true)){ - std::cerr << "Illegal primitive type match float!" << std::endl; - exit(1); - return false; - } - - this->buffer_data_uncompressed += (float)value; - ++this->header.n_additions; - return(true); - } - - inline bool Add(const double& value){ - if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ - this->header.data_header.setType(YON_TYPE_DOUBLE); - this->header.data_header.controller.signedness = true; - } - - // Make checks - if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_DOUBLE, true)){ - std::cerr << "Illegal primitive type match double!" 
<< std::endl; - exit(1); - return false; - } - - this->buffer_data_uncompressed += (double)value; - ++this->header.n_additions; - return(true); - } - - inline bool AddCharacter(const char& value){ - if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ - this->header.data_header.setType(YON_TYPE_CHAR); - this->header.data_header.controller.signedness = true; - } - - // Make checks - if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_CHAR, true)){ - std::cerr << "Illegal primitive type match char!" << std::endl; - exit(1); - return false; - } - - this->buffer_data_uncompressed += (char)value; - ++this->header.n_additions; - return(true); - } - - inline bool AddCharacter(const char* const string, const U32 l_string){ - if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ - this->header.data_header.setType(YON_TYPE_CHAR); - this->header.data_header.controller.signedness = true; - //std::cerr << "triggering: string" << std::endl; - } - - // Make checks - if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_CHAR, true)){ - std::cerr << "Illegal primitive type match string!" 
<< std::endl; - exit(1); - return false; - } - - this->buffer_data_uncompressed.Add(string, l_string); - this->header.n_additions += l_string; - return(true); - } + void AddStride(const U32 value); + + bool Add(const BYTE& value); + bool Add(const U16& value); + bool Add(const U32& value); + bool Add(const SBYTE& value); + bool Add(const S16& value); + bool Add(const S32& value); + bool Add(const U64& value); + bool Add(const S64& value); + bool Add(const float& value); + bool Add(const double& value); + bool AddCharacter(const char& value); + bool AddCharacter(const char* const string, const U32 l_string); // Aliases - inline bool AddString(const char* const string, const U32 l_string){ return(this->AddCharacter(string, l_string)); } - inline bool AddString(const std::string& string){ return(this->AddCharacter(&string[0], string.size())); } - inline bool AddCharacter(const std::string& string){ return(this->AddCharacter(&string[0], string.size())); } - inline bool Add(const std::string& string){ return(this->AddCharacter(&string[0], string.size())); } + bool AddString(const char* const string, const U32 l_string){ return(this->AddCharacter(string, l_string)); } + bool AddString(const std::string& string){ return(this->AddCharacter(&string[0], string.size())); } + bool AddCharacter(const std::string& string){ return(this->AddCharacter(&string[0], string.size())); } + bool Add(const std::string& string){ return(this->AddCharacter(&string[0], string.size())); } /**< * @@ -295,7 +143,7 @@ class DataContainer{ * data and, if set, the uncompressed strides data. 
* CRC32 checksums are stored in the header */ - void generateCRC(void); + void GenerateMd5(void); /**< * @@ -306,16 +154,16 @@ class DataContainer{ * 3: Compressed strides data * * @param target Target buffer stream - * @return Returns TRUE if the CRC checksums are identical or FALSE otherwise + * @return Returns TRUE if the MD5 checksums are identical or FALSE otherwise */ - bool checkCRC(int target = 0); + bool CheckMd5(int target = 0); /**< * Checks if the current data is uniform given the provided * stride size * @return Returns TRUE if the data is uniform or FALSE otherwise */ - bool checkUniformity(void); + bool CheckUniformity(void); /**< * This function is called during import to shrink each @@ -323,27 +171,27 @@ class DataContainer{ * At this stage all integer values in the stream is of * type S32. No other values can be shrunk */ - void reformatInteger(void); + void ReformatInteger(void); /**< * This function is caled during import to shrink each * stride size element to the smallest possible primitive * type to describe it without losing precision. 
*/ - void reformatStride(void); + void ReformatStride(void); /**< * Utility function that calculates the space this * object would take on disk if written out * @return Total size in bytes */ - const U32 getObjectSize(void) const; + U32 GetObjectSize(void) const; /**< * * @return */ - const U64 getObjectSizeUncompressed(void) const; + U64 GetObjectSizeUncompressed(void) const; /**< @brief Update base container header data and evaluate output byte streams * Internal use only (import): Collectively updates base @@ -355,27 +203,18 @@ class DataContainer{ * @param container Data container * @param reormat Reformat boolean */ - void updateContainer(bool reformat = true); - - /**< - * Currently unused - */ - void deltaEncode(void); - - inline const TACHYON_CORE_TYPE getDataPrimitiveType(void) const{ return(TACHYON_CORE_TYPE(this->header.data_header.controller.type)); } - inline const TACHYON_CORE_TYPE getStridePrimitiveType(void) const{ return(TACHYON_CORE_TYPE(this->header.stride_header.controller.type)); } + void UpdateContainer(bool reformat_data = true, bool reformat_stride = true); private: - inline bool __checkInteger(void){ + inline bool CheckInteger(void){ if(this->header.data_header.controller.encoder == YON_ENCODE_NONE && this->header.n_entries == 0){ - this->header.data_header.setType(YON_TYPE_32B); + this->header.data_header.SetType(YON_TYPE_32B); this->header.data_header.controller.signedness = true; } // Make checks if(!this->header.data_header.controller.compareTypeSign(YON_TYPE_32B, true)){ - std::cerr << "Illegal primitive type match integer!" << std::endl; - exit(1); + std::cerr << utility::timestamp("ERROR") << "Illegal primitive type mismatch (integer)!" 
<< std::endl; return false; } return true; @@ -383,7 +222,7 @@ class DataContainer{ friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ stream << entry.buffer_data; - if(entry.header.data_header.hasMixedStride()) + if(entry.header.data_header.HasMixedStride()) stream << entry.buffer_strides; return(stream); @@ -395,7 +234,7 @@ class DataContainer{ stream.read(entry.buffer_data.buffer, entry.header.data_header.cLength); entry.buffer_data.n_chars = entry.header.data_header.cLength; - if(entry.header.data_header.hasMixedStride()){ + if(entry.header.data_header.HasMixedStride()){ entry.buffer_strides.resize(entry.header.stride_header.cLength); stream.read(entry.buffer_strides.buffer, entry.header.stride_header.cLength); entry.buffer_strides.n_chars = entry.header.stride_header.cLength; @@ -409,7 +248,7 @@ class DataContainer{ } public: - header_type header; // usually written elsewhere + header_type header; buffer_type buffer_data; buffer_type buffer_strides; buffer_type buffer_data_uncompressed; diff --git a/lib/containers/format_container.h b/lib/containers/format_container.h index 723b6e9..f8f2f58 100644 --- a/lib/containers/format_container.h +++ b/lib/containers/format_container.h @@ -17,7 +17,7 @@ namespace containers{ */ template class FormatContainer : public FormatContainerInterface{ -private: +public: typedef FormatContainer self_type; typedef PrimitiveGroupContainer value_type; typedef value_type& reference; @@ -31,46 +31,15 @@ class FormatContainer : public FormatContainerInterface{ typedef MetaContainer meta_container_type; typedef StrideContainer stride_container_type; + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + public: FormatContainer(); FormatContainer(const data_container_type& container, const U64 n_samples); FormatContainer(const data_container_type& data_container, const meta_container_type& meta_container, const std::vector& pattern_matches, const U64 n_samples); // use when balancing 
~FormatContainer(void); - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - // Element access inline reference at(const size_type& position){ return(this->__containers[position]); } inline const_reference at(const size_type& position) const{ return(this->__containers[position]); } @@ -84,7 +53,7 @@ class FormatContainer : public FormatContainerInterface{ inline const_reference back(void) const{ return(this->__containers[this->n_entries - 1]); } // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } + inline bool empty(void) const{ return(this->n_entries == 0); } inline const size_type& size(void) const{ return(this->n_entries); } // Iterator @@ -97,22 +66,34 @@ class FormatContainer : public FormatContainerInterface{ // Type-specific inline std::ostream& to_vcf_string(std::ostream& stream, const U32 position, const U64 sample) const{ - utility::to_vcf_string(stream, this->at(position).at(sample)); + //utility::to_vcf_string(stream, 
this->at(position).at(sample).data(), this->at(position).at(sample).size()); return(stream); } inline io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const U32 position, const U64 sample) const{ - utility::to_vcf_string(buffer, this->at(position).at(sample)); + utility::to_vcf_string(buffer, this->at(position).at(sample).data(), this->at(position).at(sample).size()); return(buffer); } inline io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const U32 position, const U64 sample) const{ - utility::to_json_string(buffer, this->at(position).at(sample)); + //utility::to_json_string(buffer, this->at(position).at(sample)); return(buffer); } - inline const bool emptyPosition(const U32& position) const{ return(this->at(position).empty()); } - inline const bool emptyPosition(const U32& position, const U64& sample) const{ return(this->at(position).at(sample).empty()); } + bcf1_t* UpdateHtslibVcfRecord(const uint32_t position, bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const{ + if(this->primitive_type == YON_TYPE_8B || this->primitive_type == YON_TYPE_16B || this->primitive_type == YON_TYPE_32B || this->primitive_type == YON_TYPE_64B){ + return(this->at(position).UpdateHtslibVcfRecordFormatInt32(rec, hdr, tag)); + } else if(this->primitive_type == YON_TYPE_FLOAT || this->primitive_type == YON_TYPE_DOUBLE){ + return(this->at(position).UpdateHtslibVcfRecordFormatFloat(rec, hdr, tag)); + } else { + std::cerr << "illegal type: " << (int)this->primitive_type << std::endl; + exit(1); + } + return rec; + } + + inline bool emptyPosition(const U32& position) const{ return(this->at(position).empty()); } + inline bool emptyPosition(const U32& position, const U64& sample) const{ return(this->at(position).at(sample).empty()); } private: /**< @@ -182,47 +163,63 @@ FormatContainer::FormatContainer(const data_container_type& data_co if(data_container.buffer_data_uncompressed.size() == 0) return; - if(data_container.header.data_header.hasMixedStride()){ - 
if(data_container.header.data_header.isSigned()){ - switch(data_container.header.data_header.getPrimitiveType()){ + if(data_container.header.data_header.HasMixedStride()){ + if(data_container.header.data_header.IsSigned()){ + switch(data_container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples)); break; case(YON_TYPE_16B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples)); break; case(YON_TYPE_32B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples)); break; case(YON_TYPE_64B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples)); break; case(YON_TYPE_FLOAT): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples)); break; case(YON_TYPE_DOUBLE): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples)); break; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed type: " << (int)data_container.header.data_header.controller.type << std::endl; return; } } else { - switch(data_container.header.data_header.getPrimitiveType()){ + switch(data_container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples)); break; case(YON_TYPE_16B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples)); break; case(YON_TYPE_32B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples)); break; case(YON_TYPE_64B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples)); break; case(YON_TYPE_FLOAT): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples)); break; case(YON_TYPE_DOUBLE): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples)); 
break; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed type: " << (int)data_container.header.data_header.controller.type << std::endl; return; } } } else { - if(data_container.header.data_header.isSigned()){ - switch(data_container.header.data_header.getPrimitiveType()){ + if(data_container.header.data_header.IsSigned()){ + switch(data_container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples, data_container.header.data_header.stride)); break; case(YON_TYPE_16B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples, data_container.header.data_header.stride)); break; case(YON_TYPE_32B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples, data_container.header.data_header.stride)); break; case(YON_TYPE_64B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples, data_container.header.data_header.stride)); break; case(YON_TYPE_FLOAT): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples, data_container.header.data_header.stride)); break; case(YON_TYPE_DOUBLE): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples, data_container.header.data_header.stride)); break; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed type: " << (int)data_container.header.data_header.controller.type << std::endl; return; } } else { - switch(data_container.header.data_header.getPrimitiveType()){ + switch(data_container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples, data_container.header.data_header.stride)); break; case(YON_TYPE_16B): (this->__setupBalanced(data_container, meta_container, 
pattern_matches, n_samples, data_container.header.data_header.stride)); break; case(YON_TYPE_32B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples, data_container.header.data_header.stride)); break; case(YON_TYPE_64B): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples, data_container.header.data_header.stride)); break; case(YON_TYPE_FLOAT): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples, data_container.header.data_header.stride)); break; case(YON_TYPE_DOUBLE): (this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples, data_container.header.data_header.stride)); break; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed type: " << (int)data_container.header.data_header.controller.type << std::endl; return; } } @@ -230,14 +227,15 @@ FormatContainer::FormatContainer(const data_container_type& data_co } template -FormatContainer::FormatContainer(const data_container_type& container, const U64 n_samples) : +FormatContainer::FormatContainer(const data_container_type& container, + const U64 n_samples) : __containers(nullptr) { if(container.buffer_data_uncompressed.size() == 0) return; if(container.header.data_header.controller.mixedStride){ - if(container.header.data_header.isSigned()){ + if(container.header.data_header.IsSigned()){ switch(container.header.data_header.controller.type){ case(YON_TYPE_8B): (this->__setup(container, n_samples)); break; case(YON_TYPE_16B): (this->__setup(container, n_samples)); break; @@ -248,7 +246,7 @@ FormatContainer::FormatContainer(const data_container_type& contain default: std::cerr << "Disallowed type: " << (int)container.header.data_header.controller.type << std::endl; return; } } else { - switch(container.header.data_header.getPrimitiveType()){ + switch(container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): 
(this->__setup(container, n_samples)); break; case(YON_TYPE_16B): (this->__setup(container, n_samples)); break; case(YON_TYPE_32B): (this->__setup(container, n_samples)); break; @@ -259,24 +257,24 @@ FormatContainer::FormatContainer(const data_container_type& contain } } } else { - if(container.header.data_header.isSigned()){ + if(container.header.data_header.IsSigned()){ switch(container.header.data_header.controller.type){ - case(YON_TYPE_8B): (this->__setup(container, n_samples, container.header.data_header.getStride())); break; - case(YON_TYPE_16B): (this->__setup(container, n_samples, container.header.data_header.getStride())); break; - case(YON_TYPE_32B): (this->__setup(container, n_samples, container.header.data_header.getStride())); break; - case(YON_TYPE_64B): (this->__setup(container, n_samples, container.header.data_header.getStride())); break; - case(YON_TYPE_FLOAT): (this->__setup(container, n_samples, container.header.data_header.getStride())); break; - case(YON_TYPE_DOUBLE): (this->__setup(container, n_samples, container.header.data_header.getStride())); break; + case(YON_TYPE_8B): (this->__setup(container, n_samples, container.header.data_header.GetStride())); break; + case(YON_TYPE_16B): (this->__setup(container, n_samples, container.header.data_header.GetStride())); break; + case(YON_TYPE_32B): (this->__setup(container, n_samples, container.header.data_header.GetStride())); break; + case(YON_TYPE_64B): (this->__setup(container, n_samples, container.header.data_header.GetStride())); break; + case(YON_TYPE_FLOAT): (this->__setup(container, n_samples, container.header.data_header.GetStride())); break; + case(YON_TYPE_DOUBLE): (this->__setup(container, n_samples, container.header.data_header.GetStride())); break; default: std::cerr << "Disallowed type: " << (int)container.header.data_header.controller.type << std::endl; return; } } else { - switch(container.header.data_header.getPrimitiveType()){ - case(YON_TYPE_8B): (this->__setup(container, 
n_samples, container.header.data_header.getStride())); break; - case(YON_TYPE_16B): (this->__setup(container, n_samples, container.header.data_header.getStride())); break; - case(YON_TYPE_32B): (this->__setup(container, n_samples, container.header.data_header.getStride())); break; - case(YON_TYPE_64B): (this->__setup(container, n_samples, container.header.data_header.getStride())); break; - case(YON_TYPE_FLOAT): (this->__setup(container, n_samples, container.header.data_header.getStride())); break; - case(YON_TYPE_DOUBLE): (this->__setup(container, n_samples, container.header.data_header.getStride())); break; + switch(container.header.data_header.GetPrimitiveType()){ + case(YON_TYPE_8B): (this->__setup(container, n_samples, container.header.data_header.GetStride())); break; + case(YON_TYPE_16B): (this->__setup(container, n_samples, container.header.data_header.GetStride())); break; + case(YON_TYPE_32B): (this->__setup(container, n_samples, container.header.data_header.GetStride())); break; + case(YON_TYPE_64B): (this->__setup(container, n_samples, container.header.data_header.GetStride())); break; + case(YON_TYPE_FLOAT): (this->__setup(container, n_samples, container.header.data_header.GetStride())); break; + case(YON_TYPE_DOUBLE): (this->__setup(container, n_samples, container.header.data_header.GetStride())); break; default: std::cerr << "Disallowed type: " << (int)container.header.data_header.controller.type << std::endl; return; } } @@ -293,18 +291,19 @@ FormatContainer::~FormatContainer(){ template template -void FormatContainer::__setup(const data_container_type& container, const U64& n_samples){ +void FormatContainer::__setup(const data_container_type& container, + const U64& n_samples) +{ if(container.buffer_strides_uncompressed.size() == 0) return; - // Todo: there's no guaranteed that this is correct - this->n_entries = container.buffer_data_uncompressed.size() / sizeof(actual_primitive) / n_samples; + this->n_capacity = 
container.buffer_data_uncompressed.size() / sizeof(actual_primitive) / n_samples; + this->n_entries = 0; - std::cerr << "entries here: " << this->n_entries << std::endl; - if(this->n_entries == 0) + if(this->n_capacity == 0) return; - this->__containers = static_cast(::operator new[](this->n_entries*sizeof(value_type))); + this->__containers = static_cast(::operator new[](this->n_capacity*sizeof(value_type))); stride_container_type strides(container); U32 current_offset = 0; @@ -312,60 +311,74 @@ void FormatContainer::__setup(const data_container_type& container, //std::cerr << current_offset << '/' << container.buffer_data_uncompressed.size() << '\t' << (this->*func)(container.buffer_strides_uncompressed, i) << std::endl; new( &this->__containers[i] ) value_type( container, current_offset, n_samples, strides[i] ); current_offset += strides[i] * sizeof(actual_primitive) * n_samples; + ++this->n_entries; } assert(current_offset == container.buffer_data_uncompressed.size()); } template template -void FormatContainer::__setupBalanced(const data_container_type& data_container, const meta_container_type& meta_container, const std::vector& pattern_matches, const U64& n_samples){ - this->n_entries = meta_container.size(); - if(this->n_entries == 0) - return; +void FormatContainer::__setupBalanced(const data_container_type& data_container, + const meta_container_type& meta_container, + const std::vector& pattern_matches, + const U64& n_samples) +{ + this->n_entries = meta_container.size(); + if(this->n_entries == 0) + return; - this->__containers = static_cast(::operator new[](this->n_entries*sizeof(value_type))); - stride_container_type strides(data_container); + assert(data_container.GetSizeUncompressed() % sizeof(actual_primitive) == 0); - U32 current_offset = 0; - U32 strides_offset = 0; - for(U32 i = 0; i < this->size(); ++i){ - // There are no FORMAT fields - if(meta_container[i].getFormatPatternID() == -1){ - new( &this->__containers[i] ) value_type( ); - } - // If 
pattern matches - else if(pattern_matches[meta_container[i].getFormatPatternID()]){ - new( &this->__containers[i] ) value_type( data_container, current_offset, n_samples, strides[strides_offset] ); - current_offset += strides[strides_offset] * sizeof(actual_primitive) * n_samples; - ++strides_offset; - } - // Otherwise place an empty - else { - new( &this->__containers[i] ) value_type( ); - } + this->__containers = static_cast(::operator new[](this->n_entries*sizeof(value_type))); + stride_container_type strides(data_container); + + U32 current_offset = 0; + U32 strides_offset = 0; + for(U32 i = 0; i < this->size(); ++i){ + // There are no FORMAT fields set empty. + if(meta_container[i].GetFormatPatternId() == -1){ + new( &this->__containers[i] ) value_type( ); + } + // If pattern matches + else if(pattern_matches[meta_container[i].GetFormatPatternId()]){ + new( &this->__containers[i] ) value_type( data_container, current_offset, n_samples, strides[strides_offset] ); + current_offset += strides[strides_offset] * sizeof(actual_primitive) * n_samples; + ++strides_offset; } - assert(current_offset == data_container.buffer_data_uncompressed.size()); + // Otherwise place an empty + else { + new( &this->__containers[i] ) value_type( ); + } + } + assert(current_offset == data_container.buffer_data_uncompressed.size()); } template template -void FormatContainer::__setupBalanced(const data_container_type& data_container, const meta_container_type& meta_container, const std::vector& pattern_matches, const U64& n_samples, const U32 stride_size){ +void FormatContainer::__setupBalanced(const data_container_type& data_container, + const meta_container_type& meta_container, + const std::vector& pattern_matches, + const U64& n_samples, + const U32 stride_size) +{ this->n_entries = meta_container.size(); if(this->n_entries == 0) return; + assert(data_container.GetSizeUncompressed() % sizeof(actual_primitive) == 0); + this->__containers = static_cast(::operator 
new[](this->n_entries*sizeof(value_type))); U32 current_offset = 0; // Case 1: if data is uniform - if(data_container.header.data_header.isUniform()){ + if(data_container.header.data_header.IsUniform()){ for(U32 i = 0; i < this->size(); ++i){ // There are no FORMAT fields - if(meta_container[i].getFormatPatternID() == -1){ + if(meta_container[i].GetFormatPatternId() == -1){ new( &this->__containers[i] ) value_type( ); } // If pattern matches - else if(pattern_matches[meta_container[i].getFormatPatternID()]){ + else if(pattern_matches[meta_container[i].GetFormatPatternId()]){ new( &this->__containers[i] ) value_type( data_container, 0, n_samples, stride_size ); } // Otherwise place an empty @@ -380,7 +393,7 @@ void FormatContainer::__setupBalanced(const data_container_type& da else { for(U32 i = 0; i < this->size(); ++i){ // If pattern matches - if(pattern_matches[meta_container[i].getFormatPatternID()]){ + if(pattern_matches[meta_container[i].GetFormatPatternId()]){ new( &this->__containers[i] ) value_type( data_container, current_offset, n_samples, stride_size ); current_offset += stride_size * sizeof(actual_primitive) * n_samples; } @@ -395,7 +408,10 @@ void FormatContainer::__setupBalanced(const data_container_type& da template template -void FormatContainer::__setup(const data_container_type& container, const U64& n_samples, const U32 stride_size){ +void FormatContainer::__setup(const data_container_type& container, + const U64& n_samples, + const U32 stride_size) +{ this->n_entries = container.buffer_data_uncompressed.size() / sizeof(actual_primitive) / n_samples / stride_size; if(this->n_entries == 0) @@ -405,7 +421,7 @@ void FormatContainer::__setup(const data_container_type& container, U32 current_offset = 0; // Case 1: data is uniform -> give all samples the same value - if(container.header.data_header.isUniform()){ + if(container.header.data_header.IsUniform()){ for(U32 i = 0; i < this->size(); ++i) new( &this->__containers[i] ) value_type( container, 
current_offset, n_samples, stride_size ); diff --git a/lib/containers/format_container_interface.h b/lib/containers/format_container_interface.h index 00d0c44..a878498 100644 --- a/lib/containers/format_container_interface.h +++ b/lib/containers/format_container_interface.h @@ -10,23 +10,26 @@ class FormatContainerInterface{ typedef std::size_t size_type; public: - FormatContainerInterface() : primitive_type( YON_TYPE_32B), n_entries(0){} - FormatContainerInterface(const size_t n_entries) : primitive_type(YON_TYPE_32B), n_entries(n_entries){} + FormatContainerInterface() : primitive_type( YON_TYPE_32B), n_entries(0), n_capacity(0){} + FormatContainerInterface(const size_t n_entries) : primitive_type(YON_TYPE_32B), n_entries(n_entries), n_capacity(0){} virtual ~FormatContainerInterface(){} // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } + inline bool empty(void) const{ return(this->n_entries == 0); } inline const size_type& size(void) const{ return(this->n_entries); } virtual std::ostream& to_vcf_string(std::ostream& stream, const U32 position, const U64 sample_number) const =0; virtual io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const U32 position, const U64 sample) const =0; virtual io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const U32 position, const U64 sample) const =0; - virtual const bool emptyPosition(const U32& position) const =0; - virtual const bool emptyPosition(const U32& position, const U64& sample) const =0; + virtual bool emptyPosition(const U32& position) const =0; + virtual bool emptyPosition(const U32& position, const U64& sample) const =0; + + virtual bcf1_t* UpdateHtslibVcfRecord(const uint32_t position, bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const =0; protected: TACHYON_CORE_TYPE primitive_type; size_t n_entries; + size_t n_capacity; }; } diff --git a/lib/containers/format_container_string.cpp b/lib/containers/format_container_string.cpp index 95ed21e..b5df2de 100644 --- 
a/lib/containers/format_container_string.cpp +++ b/lib/containers/format_container_string.cpp @@ -18,7 +18,7 @@ FormatContainer::FormatContainer(const data_container_type& data_co if(data_container.buffer_data_uncompressed.size() == 0) return; - if(data_container.header.data_header.hasMixedStride()){ + if(data_container.header.data_header.HasMixedStride()){ this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples); } else { this->__setupBalanced(data_container, meta_container, pattern_matches, n_samples, data_container.header.data_header.stride); @@ -34,7 +34,7 @@ FormatContainer::FormatContainer(const data_container_type& contain if(container.header.data_header.controller.mixedStride){ this->__setup(container, n_samples); } else { - this->__setup(container, n_samples, container.header.data_header.getStride()); + this->__setup(container, n_samples, container.header.data_header.GetStride()); } } @@ -49,17 +49,25 @@ void FormatContainer::__setup(const data_container_type& container, if(container.buffer_strides_uncompressed.size() == 0) return; - if(this->n_entries == 0) + this->n_capacity = container.buffer_data_uncompressed.size() / n_samples; + this->n_entries = 0; + + if(this->n_capacity == 0) return; - this->__containers = static_cast(::operator new[](this->n_entries*sizeof(value_type))); + this->__containers = static_cast(::operator new[](this->n_capacity*sizeof(value_type))); stride_container_type strides(container); U32 current_offset = 0; - for(U32 i = 0; i < this->n_entries; ++i){ + U32 current_position = 0; + while(true){ //std::cerr << current_offset << '/' << container.buffer_data_uncompressed.size() << '\t' << (this->*func)(container.buffer_strides_uncompressed, i) << std::endl; - new( &this->__containers[i] ) value_type( container, current_offset, n_samples, strides[i] ); - current_offset += strides[i] * n_samples; + new( &this->__containers[current_position] ) value_type( container, current_offset, n_samples, 
strides[current_position] ); + current_offset += strides[current_position] * n_samples; + ++this->n_entries; + if(current_offset == container.buffer_data_uncompressed.size()) break; + assert(current_offset <= container.buffer_data_uncompressed.size()); + ++current_position; } assert(current_offset == container.buffer_data_uncompressed.size()); } @@ -76,11 +84,11 @@ void FormatContainer::__setupBalanced(const data_container_type& da U32 strides_offset = 0; for(U32 i = 0; i < this->n_entries; ++i){ // There are no INFO fields - if(meta_container[i].getInfoPatternID() == -1){ + if(meta_container[i].GetInfoPatternId() == -1){ new( &this->__containers[i] ) value_type( ); } // If pattern matches - else if(pattern_matches[meta_container[i].getFormatPatternID()]){ + else if(pattern_matches[meta_container[i].GetFormatPatternId()]){ new( &this->__containers[i] ) value_type( data_container, current_offset, n_samples, strides[strides_offset] ); current_offset += strides[strides_offset] * n_samples; ++strides_offset; @@ -102,14 +110,14 @@ void FormatContainer::__setupBalanced(const data_container_type& da U32 current_offset = 0; // Case 1: if data is uniform - if(data_container.header.data_header.isUniform()){ + if(data_container.header.data_header.IsUniform()){ for(U32 i = 0; i < this->n_entries; ++i){ // There are no INFO fields - if(meta_container[i].getInfoPatternID() == -1){ + if(meta_container[i].GetInfoPatternId() == -1){ new( &this->__containers[i] ) value_type( ); } // If pattern matches - else if(pattern_matches[meta_container[i].getFormatPatternID()]){ + else if(pattern_matches[meta_container[i].GetFormatPatternId()]){ new( &this->__containers[i] ) value_type( data_container, 0, n_samples, stride_size ); } // Otherwise place an empty @@ -124,11 +132,11 @@ void FormatContainer::__setupBalanced(const data_container_type& da else { for(U32 i = 0; i < this->n_entries; ++i){ // There are no INFO fields - if(meta_container[i].getInfoPatternID() == -1){ + 
if(meta_container[i].GetInfoPatternId() == -1){ new( &this->__containers[i] ) value_type( ); } // If pattern matches - else if(pattern_matches[meta_container[i].getFormatPatternID()]){ + else if(pattern_matches[meta_container[i].GetFormatPatternId()]){ new( &this->__containers[i] ) value_type( data_container, current_offset, n_samples, stride_size ); current_offset += stride_size * n_samples; } @@ -151,7 +159,7 @@ void FormatContainer::__setup(const data_container_type& container, U32 current_offset = 0; // Case 1: data is uniform -> give all samples the same value - if(container.header.data_header.isUniform()){ + if(container.header.data_header.IsUniform()){ for(U32 i = 0; i < this->n_entries; ++i) new( &this->__containers[i] ) value_type( container, current_offset, n_samples, stride_size ); diff --git a/lib/containers/format_container_string.h b/lib/containers/format_container_string.h index a329ad9..6854b76 100644 --- a/lib/containers/format_container_string.h +++ b/lib/containers/format_container_string.h @@ -17,7 +17,7 @@ namespace containers{ */ template <> class FormatContainer : public FormatContainerInterface{ -private: +public: typedef FormatContainer self_type; typedef PrimitiveGroupContainer value_type; typedef value_type& reference; @@ -31,46 +31,15 @@ class FormatContainer : public FormatContainerInterface{ typedef MetaContainer meta_container_type; typedef StrideContainer stride_container_type; + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + public: FormatContainer(); FormatContainer(const data_container_type& container, const U64 n_samples); FormatContainer(const data_container_type& data_container, const meta_container_type& meta_container, const std::vector& pattern_matches, const U64 n_samples); // use when balancing ~FormatContainer(void); - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void 
operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - // Element access inline reference at(const size_type& position){ return(this->__containers[position]); } inline const_reference at(const size_type& position) const{ return(this->__containers[position]); } @@ -84,7 +53,7 @@ class FormatContainer : public FormatContainerInterface{ inline const_reference back(void) const{ return(this->__containers[this->n_entries - 1]); } // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } + inline bool empty(void) const{ return(this->n_entries == 0); } inline const size_type& size(void) const{ return(this->n_entries); } // Iterator @@ -96,11 +65,24 @@ class FormatContainer : public FormatContainerInterface{ inline const_iterator cend() const{ return const_iterator(&this->__containers[this->n_entries]); } // Type-specific - inline std::ostream& to_vcf_string(std::ostream& stream, const U32 position, const U64 sample) const{ utility::to_vcf_string(stream, this->at(position).at(sample)); return(stream); } - inline io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const U32 position, const U64 sample) const{ buffer += 
this->at(position).at(sample); return(buffer); } - inline io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const U32 position, const U64 sample) const{ buffer += this->at(position).at(sample); return(buffer); } - inline const bool emptyPosition(const U32& position) const{ return(this->at(position).empty()); } - inline const bool emptyPosition(const U32& position, const U64& sample) const{ return(this->at(position).at(sample).empty()); } + inline std::ostream& to_vcf_string(std::ostream& stream, + const U32 position, + const U64 sample) const + { + //utility::to_vcf_string(stream, this->at(position).at(sample).data_); + //this->at(position).at(sample).data_; + assert(2 == 1); + return(stream); + } + + bcf1_t* UpdateHtslibVcfRecord(const uint32_t position, bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const{ + return(this->at(position).UpdateHtslibVcfRecordFormatString(rec, hdr, tag)); + } + + inline io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const U32 position, const U64 sample) const{ buffer += this->at(position).at(sample).data_; return(buffer); } + inline io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const U32 position, const U64 sample) const{ buffer += this->at(position).at(sample).data_; return(buffer); } + inline bool emptyPosition(const U32& position) const{ return(this->at(position).empty()); } + inline bool emptyPosition(const U32& position, const U64& sample) const{ return(this->at(position).at(sample).empty()); } private: /**< diff --git a/lib/containers/genotype_container.cpp b/lib/containers/genotype_container.cpp index a5e94de..3bdbadb 100644 --- a/lib/containers/genotype_container.cpp +++ b/lib/containers/genotype_container.cpp @@ -13,26 +13,32 @@ GenotypeContainer::GenotypeContainer(const block_type& block, const MetaContaine { // Todo: if anything is uniform // Support - const bool uniform_stride = block.gt_support_data_container.header.data_header.isUniform(); - PrimitiveContainer 
lengths(block.gt_support_data_container); // n_runs / objects size - - U64 offset_rle8 = 0; const char* const rle8 = block.gt_rle8_container.buffer_data_uncompressed.data(); - U64 offset_rle16 = 0; const char* const rle16 = block.gt_rle16_container.buffer_data_uncompressed.data(); - U64 offset_rle32 = 0; const char* const rle32 = block.gt_rle32_container.buffer_data_uncompressed.data(); - U64 offset_rle64 = 0; const char* const rle64 = block.gt_rle64_container.buffer_data_uncompressed.data(); - U64 offset_simple8 = 0; const char* const simple8 = block.gt_simple8_container.buffer_data_uncompressed.data(); - U64 offset_simple16 = 0; const char* const simple16 = block.gt_simple16_container.buffer_data_uncompressed.data(); - U64 offset_simple32 = 0; const char* const simple32 = block.gt_simple32_container.buffer_data_uncompressed.data(); - U64 offset_simple64 = 0; const char* const simple64 = block.gt_simple64_container.buffer_data_uncompressed.data(); - - assert(block.gt_rle8_container.buffer_data_uncompressed.size() % sizeof(BYTE) == 0); - assert(block.gt_rle16_container.buffer_data_uncompressed.size() % sizeof(U16) == 0); - assert(block.gt_rle32_container.buffer_data_uncompressed.size() % sizeof(U32) == 0); - assert(block.gt_rle64_container.buffer_data_uncompressed.size() % sizeof(U64) == 0); - assert(block.gt_simple8_container.buffer_data_uncompressed.size() % sizeof(BYTE) == 0); - assert(block.gt_simple16_container.buffer_data_uncompressed.size() % sizeof(U16) == 0); - assert(block.gt_simple32_container.buffer_data_uncompressed.size() % sizeof(U32) == 0); - assert(block.gt_simple64_container.buffer_data_uncompressed.size() % sizeof(U64) == 0); + const bool uniform_stride = block.base_containers[YON_BLK_GT_SUPPORT].header.data_header.IsUniform(); + PrimitiveContainer lengths(block.base_containers[YON_BLK_GT_SUPPORT]); // n_runs / objects size + + U64 offset_rle8 = 0; const char* const rle8 = block.base_containers[YON_BLK_GT_INT8].buffer_data_uncompressed.data(); + 
U64 offset_rle16 = 0; const char* const rle16 = block.base_containers[YON_BLK_GT_INT16].buffer_data_uncompressed.data(); + U64 offset_rle32 = 0; const char* const rle32 = block.base_containers[YON_BLK_GT_INT32].buffer_data_uncompressed.data(); + U64 offset_rle64 = 0; const char* const rle64 = block.base_containers[YON_BLK_GT_INT64].buffer_data_uncompressed.data(); + + U64 offset_simple8 = 0; const char* const simple8 = block.base_containers[YON_BLK_GT_S_INT8].buffer_data_uncompressed.data(); + U64 offset_simple16 = 0; const char* const simple16 = block.base_containers[YON_BLK_GT_S_INT16].buffer_data_uncompressed.data(); + U64 offset_simple32 = 0; const char* const simple32 = block.base_containers[YON_BLK_GT_S_INT32].buffer_data_uncompressed.data(); + U64 offset_simple64 = 0; const char* const simple64 = block.base_containers[YON_BLK_GT_S_INT64].buffer_data_uncompressed.data(); + + U64 offset_nploid8 = 0; const char* const nploid8 = block.base_containers[YON_BLK_GT_N_INT8].buffer_data_uncompressed.data(); + U64 offset_nploid16 = 0; const char* const nploid16 = block.base_containers[YON_BLK_GT_N_INT16].buffer_data_uncompressed.data(); + U64 offset_nploid32 = 0; const char* const nploid32 = block.base_containers[YON_BLK_GT_N_INT32].buffer_data_uncompressed.data(); + U64 offset_nploid64 = 0; const char* const nploid64 = block.base_containers[YON_BLK_GT_N_INT64].buffer_data_uncompressed.data(); + + assert(block.base_containers[YON_BLK_GT_INT8].buffer_data_uncompressed.size() % sizeof(BYTE) == 0); + assert(block.base_containers[YON_BLK_GT_INT16].buffer_data_uncompressed.size() % sizeof(U16) == 0); + assert(block.base_containers[YON_BLK_GT_INT32].buffer_data_uncompressed.size() % sizeof(U32) == 0); + assert(block.base_containers[YON_BLK_GT_INT64].buffer_data_uncompressed.size() % sizeof(U64) == 0); + assert(block.base_containers[YON_BLK_GT_S_INT8].buffer_data_uncompressed.size() % sizeof(BYTE) == 0); + 
assert(block.base_containers[YON_BLK_GT_S_INT16].buffer_data_uncompressed.size() % sizeof(U16) == 0); + assert(block.base_containers[YON_BLK_GT_S_INT32].buffer_data_uncompressed.size() % sizeof(U32) == 0); + assert(block.base_containers[YON_BLK_GT_S_INT64].buffer_data_uncompressed.size() % sizeof(U64) == 0); this->n_entries = meta.size(); this->__iterators = static_cast(::operator new[](this->size() * sizeof(value_type))); @@ -42,19 +48,19 @@ GenotypeContainer::GenotypeContainer(const block_type& block, const MetaContaine if(uniform_stride) incrementor = 0; for(U32 i = 0; i < meta.size(); ++i){ - if(meta[i].hasGT()){ + if(meta[i].HasGT()){ // Case run-length encoding diploid and biallelic and no missing - if(meta[i].getGenotypeEncoding() == TACHYON_GT_ENCODING::YON_GT_RLE_DIPLOID_BIALLELIC){ - if(meta[i].getGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_BYTE){ + if(meta[i].GetGenotypeEncoding() == TACHYON_GT_ENCODING::YON_GT_RLE_DIPLOID_BIALLELIC){ + if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_BYTE){ new( &this->__iterators[i] ) GenotypeContainerDiploidRLE( &rle8[offset_rle8], lengths[gt_offset], this->__meta_container[i] ); offset_rle8 += lengths[gt_offset]*sizeof(BYTE); - } else if(meta[i].getGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U16){ + } else if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U16){ new( &this->__iterators[i] ) GenotypeContainerDiploidRLE( &rle16[offset_rle16], lengths[gt_offset], this->__meta_container[i] ); offset_rle16 += lengths[gt_offset]*sizeof(U16); - } else if(meta[i].getGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U32){ + } else if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U32){ new( &this->__iterators[i] ) GenotypeContainerDiploidRLE( &rle32[offset_rle32], lengths[gt_offset], this->__meta_container[i] ); offset_rle32 += lengths[gt_offset]*sizeof(U32); - } else if(meta[i].getGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U64){ + } else 
if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U64){ new( &this->__iterators[i] ) GenotypeContainerDiploidRLE( &rle64[offset_rle64], lengths[gt_offset], this->__meta_container[i] ); offset_rle64 += lengths[gt_offset]*sizeof(U64); } else { @@ -64,17 +70,17 @@ GenotypeContainer::GenotypeContainer(const block_type& block, const MetaContaine } // Case run-length encoding diploid and biallelic/EOV or n-allelic - else if(meta[i].getGenotypeEncoding() == TACHYON_GT_ENCODING::YON_GT_RLE_DIPLOID_NALLELIC) { - if(meta[i].getGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_BYTE){ + else if(meta[i].GetGenotypeEncoding() == TACHYON_GT_ENCODING::YON_GT_RLE_DIPLOID_NALLELIC) { + if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_BYTE){ new( &this->__iterators[i] ) GenotypeContainerDiploidSimple( &simple8[offset_simple8], lengths[gt_offset], this->__meta_container[i] ); offset_simple8 += lengths[gt_offset]*sizeof(BYTE); - } else if(meta[i].getGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U16){ + } else if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U16){ new( &this->__iterators[i] ) GenotypeContainerDiploidSimple( &simple16[offset_simple16], lengths[gt_offset], this->__meta_container[i] ); offset_simple16 += lengths[gt_offset]*sizeof(U16); - } else if(meta[i].getGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U32){ + } else if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U32){ new( &this->__iterators[i] ) GenotypeContainerDiploidSimple( &simple32[offset_simple32], lengths[gt_offset], this->__meta_container[i] ); offset_simple32 += lengths[gt_offset]*sizeof(U32); - } else if(meta[i].getGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U64){ + } else if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U64){ new( &this->__iterators[i] ) GenotypeContainerDiploidSimple( &simple64[offset_simple64], lengths[gt_offset], this->__meta_container[i] ); offset_simple64 += 
lengths[gt_offset]*sizeof(U64); } else { @@ -83,17 +89,17 @@ GenotypeContainer::GenotypeContainer(const block_type& block, const MetaContaine } } // Case BCF-style encoding of diploids - else if(meta[i].getGenotypeEncoding() == TACHYON_GT_ENCODING::YON_GT_BCF_DIPLOID) { - if(meta[i].getGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_BYTE){ + else if(meta[i].GetGenotypeEncoding() == TACHYON_GT_ENCODING::YON_GT_BCF_DIPLOID) { + if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_BYTE){ new( &this->__iterators[i] ) GenotypeContainerDiploidBCF( &simple8[offset_simple8], lengths[gt_offset], this->__meta_container[i] ); offset_simple8 += lengths[gt_offset]*sizeof(BYTE); - } else if(meta[i].getGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U16){ + } else if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U16){ new( &this->__iterators[i] ) GenotypeContainerDiploidBCF( &simple16[offset_simple16], lengths[gt_offset], this->__meta_container[i] ); offset_simple16 += lengths[gt_offset]*sizeof(U16); - } else if(meta[i].getGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U32){ + } else if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U32){ new( &this->__iterators[i] ) GenotypeContainerDiploidBCF( &simple32[offset_simple32], lengths[gt_offset], this->__meta_container[i] ); offset_simple32 += lengths[gt_offset]*sizeof(U32); - } else if(meta[i].getGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U64){ + } else if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U64){ new( &this->__iterators[i] ) GenotypeContainerDiploidBCF( &simple64[offset_simple64], lengths[gt_offset], this->__meta_container[i] ); offset_simple64 += lengths[gt_offset]*sizeof(U64); } else { @@ -101,6 +107,25 @@ GenotypeContainer::GenotypeContainer(const block_type& block, const MetaContaine exit(1); } } + // Case RLE-encoding of nploids + else if(meta[i].GetGenotypeEncoding() == TACHYON_GT_ENCODING::YON_GT_RLE_NPLOID) { + 
if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_BYTE){ + new( &this->__iterators[i] ) GenotypeContainerNploid( &nploid8[offset_nploid8], lengths[gt_offset], this->__meta_container[i] ); + offset_nploid8 += lengths[gt_offset]*(sizeof(BYTE) + this->__meta_container[i].n_base_ploidy*sizeof(BYTE)); + } else if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U16){ + new( &this->__iterators[i] ) GenotypeContainerNploid( &nploid16[offset_nploid16], lengths[gt_offset], this->__meta_container[i] ); + offset_nploid16 += lengths[gt_offset]*(sizeof(U16) + this->__meta_container[i].n_base_ploidy*sizeof(BYTE)); + } else if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U32){ + new( &this->__iterators[i] ) GenotypeContainerNploid( &nploid32[offset_nploid32], lengths[gt_offset], this->__meta_container[i] ); + offset_nploid32 += lengths[gt_offset]*(sizeof(U32) + this->__meta_container[i].n_base_ploidy*sizeof(BYTE)); + } else if(meta[i].GetGenotypeType() == TACHYON_GT_PRIMITIVE_TYPE::YON_GT_U64){ + new( &this->__iterators[i] ) GenotypeContainerNploid( &nploid64[offset_nploid64], lengths[gt_offset], this->__meta_container[i] ); + offset_nploid64 += lengths[gt_offset]*(sizeof(U64) + this->__meta_container[i].n_base_ploidy*sizeof(BYTE)); + } else { + std::cerr << utility::timestamp("ERROR","GT") << "Unknown GT encoding primitive..." << std::endl; + exit(1); + } + } // Case other potential encodings else { std::cerr << utility::timestamp("ERROR","GT") << "Unknown GT encoding family..." 
<< std::endl; @@ -116,14 +141,18 @@ GenotypeContainer::GenotypeContainer(const block_type& block, const MetaContaine } } - assert(offset_rle8 == block.gt_rle8_container.getSizeUncompressed()); - assert(offset_rle16 == block.gt_rle16_container.getSizeUncompressed()); - assert(offset_rle32 == block.gt_rle32_container.getSizeUncompressed()); - assert(offset_rle64 == block.gt_rle64_container.getSizeUncompressed()); - assert(offset_simple8 == block.gt_simple8_container.getSizeUncompressed()); - assert(offset_simple16 == block.gt_simple16_container.getSizeUncompressed()); - assert(offset_simple32 == block.gt_simple32_container.getSizeUncompressed()); - assert(offset_simple64 == block.gt_simple64_container.getSizeUncompressed()); + assert(offset_rle8 == block.base_containers[YON_BLK_GT_INT8].GetSizeUncompressed()); + assert(offset_rle16 == block.base_containers[YON_BLK_GT_INT16].GetSizeUncompressed()); + assert(offset_rle32 == block.base_containers[YON_BLK_GT_INT32].GetSizeUncompressed()); + assert(offset_rle64 == block.base_containers[YON_BLK_GT_INT64].GetSizeUncompressed()); + assert(offset_simple8 == block.base_containers[YON_BLK_GT_S_INT8].GetSizeUncompressed()); + assert(offset_simple16 == block.base_containers[YON_BLK_GT_S_INT16].GetSizeUncompressed()); + assert(offset_simple32 == block.base_containers[YON_BLK_GT_S_INT32].GetSizeUncompressed()); + assert(offset_simple64 == block.base_containers[YON_BLK_GT_S_INT64].GetSizeUncompressed()); + assert(offset_nploid8 == block.base_containers[YON_BLK_GT_N_INT8].GetSizeUncompressed()); + assert(offset_nploid16 == block.base_containers[YON_BLK_GT_N_INT16].GetSizeUncompressed()); + assert(offset_nploid32 == block.base_containers[YON_BLK_GT_N_INT32].GetSizeUncompressed()); + assert(offset_nploid64 == block.base_containers[YON_BLK_GT_N_INT64].GetSizeUncompressed()); } GenotypeContainer::~GenotypeContainer(){ diff --git a/lib/containers/genotype_container.h b/lib/containers/genotype_container.h index 6c8511b..34348ac 100644 --- 
a/lib/containers/genotype_container.h +++ b/lib/containers/genotype_container.h @@ -4,6 +4,7 @@ #include "genotype_container_diploid_bcf.h" #include "genotype_container_diploid_rle.h" #include "genotype_container_diploid_simple.h" +#include "genotype_container_nploid.h" #include "meta_container.h" #include "variant_block.h" @@ -14,7 +15,8 @@ namespace containers{ template class GenotypeContainerDiploidRLE; template class GenotypeContainerDiploidSimple; -class GenotypeContainer{ + +class GenotypeContainer { private: typedef GenotypeContainer self_type; typedef GenotypeContainerInterface value_type; @@ -27,49 +29,18 @@ class GenotypeContainer{ typedef MetaContainer meta_container_type; typedef tachyon::core::MetaEntry meta_type; typedef io::BasicBuffer buffer_type; - typedef containers::GenotypeSummary gt_summary_type; + typedef yon_gt_summary gt_summary_type; typedef VariantBlock block_type; + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + public: GenotypeContainer(const block_type& block, const MetaContainer& meta); ~GenotypeContainer(); - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) 
const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } + inline bool empty(void) const{ return(this->n_entries == 0); } inline const size_type& size(void) const{ return(this->n_entries); } // Element access @@ -81,25 +52,25 @@ class GenotypeContainer{ inline const_reference at(const U32& position) const{ return(this->__iterators[position]); } // Advanced getters - inline GenotypeContainerDiploidSimple* getDiploidSimpleByte(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } - inline GenotypeContainerDiploidSimple* getDiploidSimpleU16(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } - inline GenotypeContainerDiploidSimple* getDiploidSimpleU32(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } - inline GenotypeContainerDiploidSimple* getDiploidSimpleU64(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } - inline GenotypeContainerDiploidRLE* getDiploidRLEByte(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } - inline GenotypeContainerDiploidRLE* getDiploidRLEU16(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } - inline GenotypeContainerDiploidRLE* getDiploidRLEU32(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } - inline GenotypeContainerDiploidRLE* getDiploidRLEU64(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } - inline const GenotypeContainerDiploidSimple* getDiploidSimpleByte(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } - inline const GenotypeContainerDiploidSimple* getDiploidSimpleU16(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } - inline const 
GenotypeContainerDiploidSimple* getDiploidSimpleU32(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } - inline const GenotypeContainerDiploidSimple* getDiploidSimpleU64(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } - inline const GenotypeContainerDiploidRLE* getDiploidRLEByte(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } - inline const GenotypeContainerDiploidRLE* getDiploidRLEU16(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } - inline const GenotypeContainerDiploidRLE* getDiploidRLEU32(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } - inline const GenotypeContainerDiploidRLE* getDiploidRLEU64(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } + inline GenotypeContainerDiploidSimple* GetDiploidSimpleByte(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } + inline GenotypeContainerDiploidSimple* GetDiploidSimpleU16(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } + inline GenotypeContainerDiploidSimple* GetDiploidSimpleU32(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } + inline GenotypeContainerDiploidSimple* GetDiploidSimpleU64(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } + inline GenotypeContainerDiploidRLE* GetDiploidRLEByte(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } + inline GenotypeContainerDiploidRLE* GetDiploidRLEU16(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } + inline GenotypeContainerDiploidRLE* GetDiploidRLEU32(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } + inline GenotypeContainerDiploidRLE* GetDiploidRLEU64(const U32 position){ return(reinterpret_cast*>(&this->__iterators[position])); } 
+ inline const GenotypeContainerDiploidSimple* GetDiploidSimpleByte(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } + inline const GenotypeContainerDiploidSimple* GetDiploidSimpleU16(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } + inline const GenotypeContainerDiploidSimple* GetDiploidSimpleU32(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } + inline const GenotypeContainerDiploidSimple* GetDiploidSimpleU64(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } + inline const GenotypeContainerDiploidRLE* GetDiploidRLEByte(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } + inline const GenotypeContainerDiploidRLE* GetDiploidRLEU16(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } + inline const GenotypeContainerDiploidRLE* GetDiploidRLEU32(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } + inline const GenotypeContainerDiploidRLE* GetDiploidRLEU64(const U32 position) const{ return(reinterpret_cast*>(&this->__iterators[position])); } private: - template inline const U32 getNative(const buffer_type& buffer, const U32 position) const{ + template inline U32 getNative(const buffer_type& buffer, const U32 position) const{ return(*reinterpret_cast(&buffer.buffer[position*sizeof(intrinsic_primitive)])); } diff --git a/lib/containers/genotype_container_diploid_bcf.h b/lib/containers/genotype_container_diploid_bcf.h index a36bbbe..a4baad6 100644 --- a/lib/containers/genotype_container_diploid_bcf.h +++ b/lib/containers/genotype_container_diploid_bcf.h @@ -37,18 +37,26 @@ class GenotypeContainerDiploidBCF : public GenotypeContainerInterface{ // GT-specific U32 getSum(void) const; square_matrix_type& comparePairwise(square_matrix_type& square_matrix) const; - std::vector getLiteralObjects(void) const; - std::vector 
getObjects(const U64& n_samples) const; - std::vector getObjects(const U64& n_samples, const permutation_type& ppa_manager) const; + //std::vector getLiteralObjects(void) const; + //std::vector getObjects(const U64& n_samples) const; + //std::vector getObjects(const U64& n_samples, const permutation_type& ppa_manager) const; //bool getObjects(const U64& n_samples, std::vector& objects) const; //bool getObjects(const U64& n_samples, std::vector& objects, const permutation_type& ppa_manager) const; - void getObjects(const U64& n_samples, std::vector& objects) const; - void getObjects(const U64& n_samples, std::vector& objects, const permutation_type& ppa_manager) const; - void getLiteralObjects(std::vector& objects) const; + //void getObjects(const U64& n_samples, std::vector& objects) const; + //void getObjects(const U64& n_samples, std::vector& objects, const permutation_type& ppa_manager) const; + //void getLiteralObjects(std::vector& objects) const; gt_summary& updateSummary(gt_summary& gt_summary_object) const; gt_summary getSummary(void) const; gt_summary& getSummary(gt_summary& gt_summary_object) const; void getTsTv(std::vector& objects) const; + + yon_gt* GetObjects(const uint32_t n_samples){ + return(nullptr); + } + + yon_gt* GetObjects(yon_gt_ppa& ppa){ + return(nullptr); + } }; @@ -84,6 +92,7 @@ math::SquareMatrix& GenotypeContainerDiploidBCF::comparePairwise(squa return(square_matrix); } +/* template std::vector GenotypeContainerDiploidBCF::getLiteralObjects(void) const{ std::vector ret(this->n_entries); @@ -205,24 +214,21 @@ void GenotypeContainerDiploidBCF::getObjects(const U64& n_samples, std::vecto entries[ppa_manager[i]].n_ploidy = 2; } } +*/ template -GenotypeSummary& GenotypeContainerDiploidBCF::updateSummary(gt_summary& gt_summary_object) const{ - gt_summary_object += *this; +yon_gt_summary& GenotypeContainerDiploidBCF::updateSummary(gt_summary& gt_summary_object) const{ return(gt_summary_object); } template -GenotypeSummary 
GenotypeContainerDiploidBCF::getSummary(void) const{ +yon_gt_summary GenotypeContainerDiploidBCF::getSummary(void) const{ gt_summary summary; - summary += *this; return(summary); } template -GenotypeSummary& GenotypeContainerDiploidBCF::getSummary(gt_summary& gt_summary_object) const{ - gt_summary_object.clear(); - gt_summary_object += *this; +yon_gt_summary& GenotypeContainerDiploidBCF::getSummary(gt_summary& gt_summary_object) const{ return(gt_summary_object); } @@ -232,7 +238,7 @@ void GenotypeContainerDiploidBCF::getTsTv(std::vector& obj return; // Has to be a SNV and biallelic - if(this->getMeta().isBiallelicSNV() == false) return; + if(this->getMeta().IsBiallelicSNV() == false) return; const BYTE shift = (sizeof(T)*8 - 1) / 2; diff --git a/lib/containers/genotype_container_diploid_rle.h b/lib/containers/genotype_container_diploid_rle.h index bc2ae28..98f37da 100644 --- a/lib/containers/genotype_container_diploid_rle.h +++ b/lib/containers/genotype_container_diploid_rle.h @@ -37,17 +37,55 @@ class GenotypeContainerDiploidRLE : public GenotypeContainerInterface{ // GT-specific U32 getSum(void) const; square_matrix_type& comparePairwise(square_matrix_type& square_matrix) const; - std::vector getLiteralObjects(void) const; - std::vector getObjects(const U64& n_samples) const; - std::vector getObjects(const U64& n_samples, const permutation_type& ppa_manager) const; - void getObjects(const U64& n_samples, std::vector& objects) const; - void getObjects(const U64& n_samples, std::vector& objects, const permutation_type& ppa_manager) const; - void getLiteralObjects(std::vector& objects) const; + //std::vector getLiteralObjects(void) const; + //std::vector getObjects(const U64& n_samples) const; + //std::vector getObjects(const U64& n_samples, const permutation_type& ppa_manager) const; + //void getObjects(const U64& n_samples, std::vector& objects) const; + //void getObjects(const U64& n_samples, std::vector& objects, const permutation_type& ppa_manager) const; + 
//void getLiteralObjects(std::vector& objects) const; gt_summary& updateSummary(gt_summary& gt_summary_object) const; gt_summary getSummary(void) const; gt_summary& getSummary(gt_summary& gt_summary_object) const; void getTsTv(std::vector& objects) const; + + yon_gt* GetObjects(const uint32_t n_samples){ + yon_gt* x = new yon_gt; + x->shift = this->__meta.IsAnyGTMissing() ? 2 : 1; + x->add = this->__meta.IsGTMixedPhasing() ? 1 : 0; + x->global_phase = this->__meta.GetControllerPhase(); + x->data = this->__data; + x->n_i = this->n_entries; + x->method = 1; + x->p = sizeof(T); + x->m = 2; + x->n_s = n_samples; + x->n_allele = this->__meta.n_alleles; + x->ppa = nullptr; + + assert(this->__meta.n_base_ploidy == 2); + + return(x); + } + + yon_gt* GetObjects(yon_gt_ppa& ppa){ + yon_gt* x = new yon_gt; + x->shift = this->__meta.IsAnyGTMissing() ? 2 : 1; + x->add = this->__meta.IsGTMixedPhasing() ? 1 : 0; + x->global_phase = this->__meta.GetControllerPhase(); + x->data = this->__data; + x->m = 2; + x->p = sizeof(T); + x->n_i = this->n_entries; + x->method = 1; + x->n_s = ppa.n_samples; + x->ppa = &ppa; + x->n_allele = this->__meta.n_alleles; + + assert(this->__meta.n_base_ploidy == 2); + + return(x); + } }; @@ -72,11 +110,11 @@ GenotypeContainerDiploidRLE::~GenotypeContainerDiploidRLE(){ } template U32 GenotypeContainerDiploidRLE::getSum(void) const{ U32 count = 0; - const BYTE shift = this->__meta.isAnyGTMissing() ? 2 : 1; - const BYTE add = this->__meta.isGTMixedPhasing() ? 1 : 0; + const BYTE shift = this->__meta.IsAnyGTMissing() ? 2 : 1; + const BYTE add = this->__meta.IsGTMixedPhasing() ? 
1 : 0; for(U32 i = 0; i < this->n_entries; ++i) - count += YON_GT_RLE_LENGTH(this->at(i), shift, add); + count += YON_GT_RLE_LENGTH(this->at(i), shift, add); return(count); } @@ -84,13 +122,13 @@ U32 GenotypeContainerDiploidRLE::getSum(void) const{ template math::SquareMatrix& GenotypeContainerDiploidRLE::comparePairwise(square_matrix_type& square_matrix) const{ // Has to be a SNV - if(this->getMeta().isBiallelicSNV() == false){ + if(this->getMeta().IsBiallelicSNV() == false){ //std::cerr << "skipping" << std::endl; return square_matrix; } - const BYTE shift = this->__meta.isAnyGTMissing() ? 2 : 1; - const BYTE add = this->__meta.isGTMixedPhasing() ? 1 : 0; + const BYTE shift = this->__meta.IsAnyGTMissing() ? 2 : 1; + const BYTE add = this->__meta.IsGTMixedPhasing() ? 1 : 0; U32 start_position = 0; for(U32 i = 0; i < this->n_entries; ++i){ @@ -138,10 +176,11 @@ math::SquareMatrix& GenotypeContainerDiploidRLE::comparePairwise(squa return(square_matrix); } +/* template std::vector GenotypeContainerDiploidRLE::getLiteralObjects(void) const{ std::vector ret(this->n_entries); - tachyon::core::GTObjectDiploidRLE* entries = reinterpret_cast(&ret[0]); + core::GTObjectDiploidRLE* entries = reinterpret_cast(&ret[0]); for(U32 i = 0; i < this->n_entries; ++i) entries[i](this->at(i), this->__meta); @@ -151,10 +190,10 @@ std::vector GenotypeContainerDiploidRLE::getLiteralO template std::vector GenotypeContainerDiploidRLE::getObjects(const U64& n_samples) const{ std::vector ret(n_samples); - tachyon::core::GTObjectDiploidRLE* entries = reinterpret_cast(&ret[0]); + core::GTObjectDiploidRLE* entries = reinterpret_cast(&ret[0]); - const BYTE shift = this->__meta.isAnyGTMissing() ? 2 : 1; - const BYTE add = this->__meta.isGTMixedPhasing() ? 1 : 0; + const BYTE shift = this->__meta.IsAnyGTMissing() ? 2 : 1; + const BYTE add = this->__meta.IsGTMixedPhasing() ? 
1 : 0; U32 cum_pos = 0; for(U32 i = 0; i < this->n_entries; ++i){ @@ -164,7 +203,7 @@ std::vector GenotypeContainerDiploidRLE::getObjects( BYTE phasing = 0; if(add) phasing = this->at(i) & 1; - else phasing = this->__meta.getControllerPhase(); + else phasing = this->__meta.GetControllerPhase(); for(U32 j = 0; j < length; ++j, cum_pos++){ entries[cum_pos].alleles = new core::GTObjectAllele[2]; @@ -183,10 +222,10 @@ std::vector GenotypeContainerDiploidRLE::getObjects( template std::vector GenotypeContainerDiploidRLE::getObjects(const U64& n_samples, const permutation_type& ppa_manager) const{ std::vector ret(n_samples); - tachyon::core::GTObjectDiploidRLE* entries = reinterpret_cast(&ret[0]); + core::GTObjectDiploidRLE* entries = reinterpret_cast(&ret[0]); - const BYTE shift = this->__meta.isAnyGTMissing() ? 2 : 1; - const BYTE add = this->__meta.isGTMixedPhasing() ? 1 : 0; + const BYTE shift = this->__meta.IsAnyGTMissing() ? 2 : 1; + const BYTE add = this->__meta.IsGTMixedPhasing() ? 1 : 0; U32 cum_pos = 0; for(U32 i = 0; i < this->n_entries; ++i){ @@ -198,7 +237,7 @@ std::vector GenotypeContainerDiploidRLE::getObjects( BYTE phasing = 0; if(add) phasing = this->at(i) & 1; - else phasing = this->__meta.getControllerPhase(); + else phasing = this->__meta.GetControllerPhase(); for(U32 j = 0; j < length; ++j, cum_pos++){ entries[ppa_manager[cum_pos]].alleles = new core::GTObjectAllele[2]; @@ -218,9 +257,18 @@ std::vector GenotypeContainerDiploidRLE::getObjects( template void GenotypeContainerDiploidRLE::getLiteralObjects(std::vector& objects) const{ if(objects.size() < this->size()) objects.resize(this->size()); - tachyon::core::GTObjectDiploidRLE* entries = reinterpret_cast(&objects[0]); - for(U32 i = 0; i < this->size(); ++i) - entries[i](this->at(i), this->__meta); + core::GTObjectDiploidRLE* entries = reinterpret_cast(&objects[0]); + + const BYTE shift = this->__meta.IsAnyGTMissing() + 1; + + if(this->__meta.IsGTMixedPhasing()){ + for(U32 i = 0; i < this->size(); 
++i) + yon_gt_rle_obj::Evaluate(entries[i], shift, this->at(i)); + + } else { + for(U32 i = 0; i < this->size(); ++i) + yon_gt_rle_obj::Evaluate(entries[i], shift, this->__meta.GetControllerPhase(), this->at(i)); + } } // Todo @@ -231,10 +279,10 @@ void GenotypeContainerDiploidRLE::getObjects(const U64& n_samples, std::vecto for(U32 i = 0; i < n_samples; ++i) objects[i].alleles = new core::GTObjectAllele[2]; } - tachyon::core::GTObjectDiploidRLE* entries = reinterpret_cast(&objects[0]); + core::GTObjectDiploidRLE* entries = reinterpret_cast(&objects[0]); - const BYTE shift = this->__meta.isAnyGTMissing() ? 2 : 1; - const BYTE add = this->__meta.isGTMixedPhasing() ? 1 : 0; + const BYTE shift = this->__meta.IsAnyGTMissing() ? 2 : 1; + const BYTE add = this->__meta.IsGTMixedPhasing() ? 1 : 0; U32 cum_pos = 0; S32 alleleA, alleleB; @@ -244,11 +292,13 @@ void GenotypeContainerDiploidRLE::getObjects(const U64& n_samples, std::vecto length = YON_GT_RLE_LENGTH(this->at(i), shift, add); alleleA = YON_GT_RLE_ALLELE_A(this->at(i), shift, add); alleleB = YON_GT_RLE_ALLELE_B(this->at(i), shift, add); - alleleA -= core::YON_GT_RLE_CORRECTION[alleleA]; - alleleB -= core::YON_GT_RLE_CORRECTION[alleleB]; + //alleleA -= core::YON_GT_RLE_CORRECTION[alleleA]; + //alleleB -= core::YON_GT_RLE_CORRECTION[alleleB]; + alleleA = core::YON_GT_RLE_RECODE[alleleA]; + alleleB = core::YON_GT_RLE_RECODE[alleleB]; if(add) phasing = this->at(i) & 1; - else phasing = this->__meta.getControllerPhase(); + else phasing = this->__meta.GetControllerPhase(); for(U32 j = 0; j < length; ++j, cum_pos++){ entries[cum_pos].alleles[0].allele = alleleA; @@ -271,10 +321,10 @@ void GenotypeContainerDiploidRLE::getObjects(const U64& n_samples, for(U32 i = 0; i < n_samples; ++i) objects[i].alleles = new core::GTObjectAllele[2]; } - tachyon::core::GTObjectDiploidRLE* entries = reinterpret_cast(&objects[0]); + core::GTObjectDiploidRLE* entries = reinterpret_cast(&objects[0]); - const BYTE shift = 
this->__meta.isAnyGTMissing() ? 2 : 1; - const BYTE add = this->__meta.isGTMixedPhasing() ? 1 : 0; + const BYTE shift = this->__meta.IsAnyGTMissing() ? 2 : 1; + const BYTE add = this->__meta.IsGTMixedPhasing() ? 1 : 0; U32 cum_pos = 0; S32 alleleA, alleleB; @@ -290,7 +340,7 @@ void GenotypeContainerDiploidRLE::getObjects(const U64& n_samples, alleleB = core::YON_GT_RLE_RECODE[alleleB]; if(add) phasing = this->at(i) & 1; - else phasing = this->__meta.getControllerPhase(); + else phasing = this->__meta.GetControllerPhase(); for(U32 j = 0; j < length; ++j, cum_pos++){ entries[ppa_manager[cum_pos]].alleles[0].allele = alleleA; @@ -302,33 +352,30 @@ void GenotypeContainerDiploidRLE::getObjects(const U64& n_samples, } } } +*/ template -GenotypeSummary& GenotypeContainerDiploidRLE::updateSummary(gt_summary& gt_summary_object) const{ - gt_summary_object += *this; +yon_gt_summary& GenotypeContainerDiploidRLE::updateSummary(gt_summary& gt_summary_object) const{ return(gt_summary_object); } template -GenotypeSummary GenotypeContainerDiploidRLE::getSummary(void) const{ +yon_gt_summary GenotypeContainerDiploidRLE::getSummary(void) const{ gt_summary summary; - summary += *this; return(summary); } template -GenotypeSummary& GenotypeContainerDiploidRLE::getSummary(gt_summary& gt_summary_object) const{ - gt_summary_object.clear(); - gt_summary_object += *this; +yon_gt_summary& GenotypeContainerDiploidRLE::getSummary(gt_summary& gt_summary_object) const{ return(gt_summary_object); } template void GenotypeContainerDiploidRLE::getTsTv(std::vector& objects) const{ if(this->size() == 0) return; - if(this->getMeta().isDiploid() == false) return; + if(this->getMeta().IsDiploid() == false) return; if(this->getMeta().alleles[0].size() != 1) return; - assert(this->getMeta().getNumberAlleles() == 2); + assert(this->getMeta().GetNumberAlleles() == 2); BYTE references[10]; switch(this->getMeta().alleles[0].allele[0]){ @@ -364,8 +411,8 @@ void GenotypeContainerDiploidRLE::getTsTv(std::vector& 
obj const BYTE* const transition_map_target = constants::TRANSITION_MAP[references[0]]; const BYTE* const transversion_map_target = constants::TRANSVERSION_MAP[references[0]]; - const BYTE shift = this->__meta.isAnyGTMissing() ? 2 : 1; - const BYTE add = this->__meta.isGTMixedPhasing() ? 1 : 0; + const BYTE shift = this->__meta.IsAnyGTMissing() ? 2 : 1; + const BYTE add = this->__meta.IsGTMixedPhasing() ? 1 : 0; // Cycle over genotype objects U32 cum_position = 0; diff --git a/lib/containers/genotype_container_diploid_simple.h b/lib/containers/genotype_container_diploid_simple.h index c807deb..2db26b1 100644 --- a/lib/containers/genotype_container_diploid_simple.h +++ b/lib/containers/genotype_container_diploid_simple.h @@ -46,16 +46,54 @@ class GenotypeContainerDiploidSimple : public GenotypeContainerInterface{ // GT-specific U32 getSum(void) const; square_matrix_type& comparePairwise(square_matrix_type& square_matrix) const; - std::vector getLiteralObjects(void) const; - std::vector getObjects(const U64& n_samples) const; - std::vector getObjects(const U64& n_samples, const permutation_type& ppa_manager) const; - void getObjects(const U64& n_samples, std::vector& objects) const; - void getObjects(const U64& n_samples, std::vector& objects, const permutation_type& ppa_manager) const; - void getLiteralObjects(std::vector& objects) const; + //std::vector getLiteralObjects(void) const; + //std::vector getObjects(const U64& n_samples) const; + //std::vector getObjects(const U64& n_samples, const permutation_type& ppa_manager) const; + //void getObjects(const U64& n_samples, std::vector& objects) const; + //void getObjects(const U64& n_samples, std::vector& objects, const permutation_type& ppa_manager) const; + //void getLiteralObjects(std::vector& objects) const; gt_summary& updateSummary(gt_summary& gt_summary_object) const; gt_summary getSummary(void) const; gt_summary& getSummary(gt_summary& gt_summary_object) const; void getTsTv(std::vector& objects) const; + + 
yon_gt* GetObjects(const uint32_t n_samples){ + yon_gt* x = new yon_gt; + x->n_allele = this->__meta.n_alleles; + x->shift = ceil(log2(x->n_allele + 2 + 1)); + x->add = this->__meta.IsGTMixedPhasing() ? 1 : 0; + x->global_phase = this->__meta.GetControllerPhase(); + x->data = this->__data; + x->n_i = this->n_entries; + x->method = 2; + x->p = sizeof(T); + x->m = 2; + x->n_s = n_samples; + x->ppa = nullptr; + + assert(this->__meta.n_base_ploidy == 2); + + return(x); + } + + yon_gt* GetObjects(yon_gt_ppa& ppa){ + yon_gt* x = new yon_gt; + x->n_allele = this->__meta.n_alleles; + x->shift = ceil(log2(x->n_allele + 2 + 1)); + x->add = this->__meta.IsGTMixedPhasing() ? 1 : 0; + x->global_phase = this->__meta.GetControllerPhase(); + x->data = this->__data; + x->m = 2; + x->p = sizeof(T); + x->n_i = this->n_entries; + x->method = 2; + x->n_s = ppa.n_samples; + x->ppa = &ppa; + + assert(this->__meta.n_base_ploidy == 2); + + return(x); + } }; @@ -82,8 +120,8 @@ template U32 GenotypeContainerDiploidSimple::getSum(void) const{ U32 count = 0; - const BYTE shift = ceil(log2(this->__meta.getNumberAlleles() + 1 + this->__meta.isAnyGTMissing())); // Bits occupied per allele, 1 value for missing - const BYTE add = this->__meta.isGTMixedPhasing() ? 1 : 0; + const BYTE shift = ceil(log2(this->__meta.GetNumberAlleles() + 1 + this->__meta.IsAnyGTMissing())); // Bits occupied per allele, 1 value for missing + const BYTE add = this->__meta.IsGTMixedPhasing() ? 1 : 0; for(U32 i = 0; i < this->n_entries; ++i) count += YON_GT_RLE_LENGTH(this->at(i), shift, add); @@ -93,8 +131,8 @@ U32 GenotypeContainerDiploidSimple::getSum(void) const{ template math::SquareMatrix& GenotypeContainerDiploidSimple::comparePairwise(square_matrix_type& square_matrix) const{ - const BYTE shift = ceil(log2(this->__meta.getNumberAlleles() + 1 + this->__meta.isAnyGTMissing())); // Bits occupied per allele, 1 value for missing - const BYTE add = this->__meta.isGTMixedPhasing() ? 
1 : 0; + const BYTE shift = ceil(log2(this->__meta.GetNumberAlleles() + 1 + this->__meta.IsAnyGTMissing())); // Bits occupied per allele, 1 value for missing + const BYTE add = this->__meta.IsGTMixedPhasing() ? 1 : 0; U32 start_position = 0; for(U32 i = 0; i < this->n_entries; ++i){ @@ -145,6 +183,7 @@ math::SquareMatrix& GenotypeContainerDiploidSimple::compare return(square_matrix); } +/* template std::vector GenotypeContainerDiploidSimple::getLiteralObjects(void) const{ std::vector ret(this->n_entries); @@ -160,8 +199,8 @@ std::vector GenotypeContainerDiploidSimple std::vector ret(n_samples); tachyon::core::GTObjectDiploidSimple* entries = reinterpret_cast(&ret[0]); - const BYTE shift = ceil(log2(this->__meta.getNumberAlleles() + 1 + this->__meta.isAnyGTMissing() + this->__meta.isMixedPloidy())); // Bits occupied per allele, 1 value for missing - const BYTE add = this->__meta.isGTMixedPhasing() ? 1 : 0; + const BYTE shift = ceil(log2(this->__meta.GetNumberAlleles() + 1 + this->__meta.IsAnyGTMissing() + this->__meta.IsMixedPloidy())); // Bits occupied per allele, 1 value for missing + const BYTE add = this->__meta.IsGTMixedPhasing() ? 1 : 0; U32 cum_pos = 0; for(U32 i = 0; i < this->n_entries; ++i){ @@ -172,7 +211,7 @@ std::vector GenotypeContainerDiploidSimple BYTE phasing = 0; if(add) phasing = this->at(i) & 1; - else phasing = this->__meta.getControllerPhase(); + else phasing = this->__meta.GetControllerPhase(); for(U32 j = 0; j < length; ++j, cum_pos++){ entries[cum_pos].alleles = new core::GTObjectAllele[2]; @@ -192,8 +231,8 @@ std::vector GenotypeContainerDiploidSimple std::vector ret(n_samples); tachyon::core::GTObjectDiploidSimple* entries = reinterpret_cast(&ret[0]); - const BYTE shift = ceil(log2(this->__meta.getNumberAlleles() + 1 + this->__meta.isAnyGTMissing() + this->__meta.isMixedPloidy())); // Bits occupied per allele, 1 value for missing - const BYTE add = this->__meta.isGTMixedPhasing() ? 
1 : 0; + const BYTE shift = ceil(log2(this->__meta.GetNumberAlleles() + 1 + this->__meta.IsAnyGTMissing() + this->__meta.IsMixedPloidy())); // Bits occupied per allele, 1 value for missing + const BYTE add = this->__meta.IsGTMixedPhasing() ? 1 : 0; U32 cum_pos = 0; for(U32 i = 0; i < this->n_entries; ++i){ @@ -204,7 +243,7 @@ std::vector GenotypeContainerDiploidSimple BYTE phasing = 0; if(add) phasing = this->at(i) & 1; - else phasing = this->__meta.getControllerPhase(); + else phasing = this->__meta.GetControllerPhase(); for(U32 j = 0; j < length; ++j, cum_pos++){ entries[ppa_manager[cum_pos]].alleles = new core::GTObjectAllele[2]; @@ -239,8 +278,8 @@ void GenotypeContainerDiploidSimple::getObjects(const U64& n_sample } tachyon::core::GTObjectDiploidSimple* entries = reinterpret_cast(&objects[0]); - const BYTE shift = ceil(log2(this->__meta.getNumberAlleles() + 2 + 1)); // Bits occupied per allele, 1 value for missing - const BYTE add = this->__meta.isGTMixedPhasing() ? 1 : 0; + const BYTE shift = ceil(log2(this->__meta.GetNumberAlleles() + 2 + 1)); // Bits occupied per allele, 1 value for missing + const BYTE add = this->__meta.IsGTMixedPhasing() ? 1 : 0; U32 cum_pos = 0; for(U32 i = 0; i < this->n_entries; ++i){ @@ -251,7 +290,7 @@ void GenotypeContainerDiploidSimple::getObjects(const U64& n_sample BYTE phasing = 0; if(add) phasing = this->at(i) & 1; - else phasing = this->__meta.getControllerPhase(); + else phasing = this->__meta.GetControllerPhase(); for(U32 j = 0; j < length; ++j, cum_pos++){ entries[cum_pos].alleles[0].allele = alleleA; @@ -275,8 +314,8 @@ void GenotypeContainerDiploidSimple::getObjects(const U64& n_sample } tachyon::core::GTObjectDiploidSimple* entries = reinterpret_cast(&objects[0]); - const BYTE shift = ceil(log2(this->__meta.getNumberAlleles() + 2 + 1)); // Bits occupied per allele, 1 value for missing - const BYTE add = this->__meta.isGTMixedPhasing() ? 
1 : 0; + const BYTE shift = ceil(log2(this->__meta.GetNumberAlleles() + 2 + 1)); // Bits occupied per allele, 1 value for missing + const BYTE add = this->__meta.IsGTMixedPhasing() ? 1 : 0; U32 cum_pos = 0; for(U32 i = 0; i < this->n_entries; ++i){ @@ -287,7 +326,7 @@ void GenotypeContainerDiploidSimple::getObjects(const U64& n_sample BYTE phasing = 0; if(add) phasing = this->at(i) & 1; - else phasing = this->__meta.getControllerPhase(); + else phasing = this->__meta.GetControllerPhase(); for(U32 j = 0; j < length; ++j, cum_pos++){ entries[ppa_manager[cum_pos]].alleles[0].allele = alleleA; @@ -299,41 +338,38 @@ void GenotypeContainerDiploidSimple::getObjects(const U64& n_sample } } } +*/ template -GenotypeSummary& GenotypeContainerDiploidSimple::updateSummary(gt_summary& gt_summary_object) const{ - gt_summary_object += *this; +yon_gt_summary& GenotypeContainerDiploidSimple::updateSummary(gt_summary& gt_summary_object) const{ return(gt_summary_object); } template -GenotypeSummary GenotypeContainerDiploidSimple::getSummary(void) const{ +yon_gt_summary GenotypeContainerDiploidSimple::getSummary(void) const{ gt_summary summary; - summary += *this; return(summary); } template -GenotypeSummary& GenotypeContainerDiploidSimple::getSummary(gt_summary& gt_summary_object) const{ - gt_summary_object.clear(); - gt_summary_object += *this; +yon_gt_summary& GenotypeContainerDiploidSimple::getSummary(gt_summary& gt_summary_object) const{ return(gt_summary_object); } template void GenotypeContainerDiploidSimple::getTsTv(std::vector& objects) const{ if(this->size() == 0) return; - if(this->getMeta().isDiploid() == false) return; + if(this->getMeta().IsDiploid() == false) return; if(this->getMeta().alleles[0].size() != 1) return; // If alleleA/B == ref then update self // If allele != ref then update ref->observed - const U32 n_references = this->getMeta().getNumberAlleles() + 1 + this->__meta.isMixedPloidy(); + const U32 n_references = this->getMeta().GetNumberAlleles() + 1 + 
this->__meta.IsMixedPloidy(); BYTE* references = new BYTE[n_references]; references[0] = constants::REF_ALT_MISSING; - references[1 + this->__meta.isMixedPloidy()] = constants::REF_ALT_MISSING; - U32 start_reference = 1 + this->__meta.isMixedPloidy(); + references[1 + this->__meta.IsMixedPloidy()] = constants::REF_ALT_MISSING; + U32 start_reference = 1 + this->__meta.IsMixedPloidy(); switch(this->getMeta().alleles[0].allele[0]){ case('A'): references[start_reference] = constants::REF_ALT_A; break; @@ -351,7 +387,7 @@ void GenotypeContainerDiploidSimple::getTsTv(std::vectorgetMeta().getNumberAlleles(); ++i, start_reference++){ + for(U32 i = 1; i < this->getMeta().GetNumberAlleles(); ++i, start_reference++){ if(this->getMeta().alleles[i].size() != 1){ //references[i] = constants::REF_ALT_INSERTION; references[start_reference] = constants::REF_ALT_MISSING; @@ -369,8 +405,8 @@ void GenotypeContainerDiploidSimple::getTsTv(std::vector__meta.getNumberAlleles() + 1 + this->__meta.isAnyGTMissing() + this->__meta.isMixedPloidy())); // Bits occupied per allele, 1 value for missing - const BYTE add = this->__meta.isGTMixedPhasing() ? 1 : 0; + const BYTE shift = ceil(log2(this->__meta.GetNumberAlleles() + 1 + this->__meta.IsAnyGTMissing() + this->__meta.IsMixedPloidy())); // Bits occupied per allele, 1 value for missing + const BYTE add = this->__meta.IsGTMixedPhasing() ? 
1 : 0; // Cycle over genotype objects U32 cum_position = 0; diff --git a/lib/containers/genotype_container_interface.h b/lib/containers/genotype_container_interface.h index dfbf937..338b5ec 100644 --- a/lib/containers/genotype_container_interface.h +++ b/lib/containers/genotype_container_interface.h @@ -4,26 +4,20 @@ #include "data_container.h" #include "core/ts_tv_object.h" #include "math/square_matrix.h" -#include "algorithm/permutation/permutation_manager.h" -#include "core/genotype_object.h" -#include "core/genotype_summary.h" +#include "core/genotypes.h" namespace tachyon{ namespace containers{ // Todo: value type should be GT object class GenotypeContainerInterface{ -private: +public: typedef GenotypeContainerInterface self_type; typedef std::size_t size_type; - -protected: typedef core::MetaEntry meta_type; typedef core::VariantController hot_controller_type; - typedef core::GTObject gt_object; - typedef GenotypeSummary gt_summary; + typedef yon_gt_summary gt_summary; typedef math::SquareMatrix square_matrix_type; - typedef algorithm::PermutationManager permutation_type; typedef core::TsTvObject ts_tv_object_type; // Function pointers @@ -37,7 +31,7 @@ class GenotypeContainerInterface{ GenotypeContainerInterface(const char* const data, const size_type& n_entries, const U32& n_bytes, const meta_type& meta) : n_entries(n_entries), - __data(new char[n_bytes]), + __data(new uint8_t[n_bytes]), __meta(meta) { memcpy(this->__data, data, n_bytes); @@ -61,17 +55,19 @@ class GenotypeContainerInterface{ virtual gt_summary getSummary(void) const =0; virtual gt_summary& getSummary(gt_summary& gt_summary_object) const =0; + virtual yon_gt* GetObjects(const uint32_t n_samples) =0; + virtual yon_gt* GetObjects(yon_gt_ppa& ppa) =0; - virtual std::vector getLiteralObjects(void) const =0; - virtual std::vector getObjects(const U64& n_samples) const =0; - virtual std::vector getObjects(const U64& n_samples, const permutation_type& ppa_manager) const =0; - virtual void 
getObjects(const U64& n_samples, std::vector& objects) const =0; - virtual void getObjects(const U64& n_samples, std::vector& objects, const permutation_type& ppa_manager) const =0; - virtual void getLiteralObjects(std::vector& objects) const =0; + //virtual std::vector getLiteralObjects(void) const =0; + //virtual std::vector getObjects(const U64& n_samples) const =0; + //virtual std::vector getObjects(const U64& n_samples, const permutation_type& ppa_manager) const =0; + //virtual void getObjects(const U64& n_samples, std::vector& objects) const =0; + //virtual void getObjects(const U64& n_samples, std::vector& objects, const permutation_type& ppa_manager) const =0; + //virtual void getLiteralObjects(std::vector& objects) const =0; virtual void getTsTv(std::vector& objects) const =0; // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } + inline bool empty(void) const{ return(this->n_entries == 0); } inline const size_type& size(void) const{ return(this->n_entries); } inline const meta_type& getMeta(void) const{ return(this->__meta); } @@ -103,7 +99,7 @@ class GenotypeContainerInterface{ protected: size_type n_entries; - char* __data; + uint8_t* __data; const meta_type __meta; }; diff --git a/lib/containers/genotype_container_nploid.h b/lib/containers/genotype_container_nploid.h new file mode 100644 index 0000000..5337fff --- /dev/null +++ b/lib/containers/genotype_container_nploid.h @@ -0,0 +1,151 @@ +#ifndef CONTAINERS_GENOTYPE_CONTAINER_NPLOID_H_ +#define CONTAINERS_GENOTYPE_CONTAINER_NPLOID_H_ + +#include "genotype_container_interface.h" + +namespace tachyon{ +namespace containers{ + +template +class GenotypeContainerNploid : public GenotypeContainerInterface{ +private: + typedef GenotypeContainerInterface parent_type; + typedef GenotypeContainerNploid self_type; + typedef T value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* 
const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + +public: + GenotypeContainerNploid(); + GenotypeContainerNploid(const char* const data, const U32 n_entries, const meta_type& meta_entry); + ~GenotypeContainerNploid(); + + // Element access + inline reference at(const size_type& position){ return(*reinterpret_cast(&this->__data[position * sizeof(value_type)])); } + inline const_reference at(const size_type& position) const{ return(*reinterpret_cast(&this->__data[position * sizeof(value_type)])); } + inline reference operator[](const size_type& position){ return(*reinterpret_cast(&this->__data[position * sizeof(value_type)])); } + inline const_reference operator[](const size_type& position) const{ return(*reinterpret_cast(&this->__data[position * sizeof(value_type)])); } + inline reference front(void){ return(*reinterpret_cast(&this->__data[0])); } + inline const_reference front(void) const{ return(*reinterpret_cast(&this->__data[0])); } + inline reference back(void){ return(*reinterpret_cast(&this->__data[(this->n_entries - 1) * sizeof(value_type)])); } + inline const_reference back(void) const{ return(*reinterpret_cast(&this->__data[(this->n_entries - 1) * sizeof(value_type)])); } + + // GT-specific + U32 getSum(void) const; + square_matrix_type& comparePairwise(square_matrix_type& square_matrix) const; + //std::vector getLiteralObjects(void) const; + //std::vector getObjects(const U64& n_samples) const; + //std::vector getObjects(const U64& n_samples, const permutation_type& ppa_manager) const; + //void getObjects(const U64& n_samples, std::vector& objects) const; + //void getObjects(const U64& n_samples, std::vector& objects, const permutation_type& ppa_manager) const; + //void getLiteralObjects(std::vector& objects) const; + + gt_summary& updateSummary(gt_summary& gt_summary_object) const; + gt_summary getSummary(void) const; + gt_summary& getSummary(gt_summary& gt_summary_object) const; + void getTsTv(std::vector& 
objects) const; + + yon_gt* GetObjects(const uint32_t n_samples){ + yon_gt* x = new yon_gt; + x->shift = 0; + x->add = this->__meta.IsGTMixedPhasing(); + x->global_phase = this->__meta.GetControllerPhase(); + x->data = this->__data; + x->n_i = this->n_entries; + x->method = 4; + x->m = this->__meta.n_base_ploidy; + x->p = sizeof(T); + x->n_s = n_samples; + x->n_allele = this->__meta.n_alleles; + x->ppa = nullptr; + + return(x); + } + + yon_gt* GetObjects(yon_gt_ppa& ppa){ + yon_gt* x = new yon_gt; + x->shift = 0; + x->add = this->__meta.IsGTMixedPhasing(); + x->global_phase = this->__meta.GetControllerPhase(); + x->data = this->__data; + x->m = this->__meta.n_base_ploidy; + x->p = sizeof(T); + x->n_i = this->n_entries; + x->method = 4; + x->n_s = ppa.n_samples; + x->ppa = &ppa; + x->n_allele = this->__meta.n_alleles; + + return(x); + } +}; + + +// IMPLEMENTATION ------------------------------------------------------------- + + +template +GenotypeContainerNploid::GenotypeContainerNploid(){ + +} + +template +GenotypeContainerNploid::GenotypeContainerNploid(const char* const data, const U32 n_entries, const meta_type& meta_entry) : + parent_type(data, n_entries, n_entries*(sizeof(value_type) + meta_entry.n_base_ploidy*sizeof(uint8_t)), meta_entry) +{ + +} + +template +GenotypeContainerNploid::~GenotypeContainerNploid(){ } + +template +U32 GenotypeContainerNploid::getSum(void) const{ + U32 count = 0; + const BYTE shift = this->__meta.IsAnyGTMissing() ? 2 : 1; + const BYTE add = this->__meta.IsGTMixedPhasing() ? 
1 : 0; + + for(U32 i = 0; i < this->n_entries; ++i) + count += YON_GT_RLE_LENGTH(this->at(i), shift, add); + + return(count); +} + +template +math::SquareMatrix& GenotypeContainerNploid::comparePairwise(square_matrix_type& square_matrix) const{ + return(square_matrix); +} + + +template +yon_gt_summary& GenotypeContainerNploid::updateSummary(gt_summary& gt_summary_object) const{ + //gt_summary_object += *this; + return(gt_summary_object); +} + +template +yon_gt_summary GenotypeContainerNploid::getSummary(void) const{ + gt_summary summary; + //summary += *this; + return(summary); +} + +template +yon_gt_summary& GenotypeContainerNploid::getSummary(gt_summary& gt_summary_object) const{ + //gt_summary_object += *this; + return(gt_summary_object); +} + +template +void GenotypeContainerNploid::getTsTv(std::vector& objects) const{ + return; +} + +} +} + +#endif /* CONTAINERS_GENOTYPE_CONTAINER_NPLOID_H_ */ diff --git a/lib/containers/hash_container.h b/lib/containers/hash_container.h deleted file mode 100644 index 94f10d6..0000000 --- a/lib/containers/hash_container.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef CORE_HASHCONTAINER_H_ -#define CORE_HASHCONTAINER_H_ - -#include "algorithm/OpenHashTable.h" - -namespace tachyon{ -namespace containers{ - -class HashContainer{ - typedef HashContainer self_type; - typedef hash::HashTable hash_table; - typedef std::vector id_vector; - -public: - HashContainer() : htable(65536, 250){} - ~HashContainer(){} - - inline const bool get(const U32& value, U32& ret){ - U32* ret2 = nullptr; - if(this->htable.GetItem(&value, ret2, sizeof(U32))){ - ret = this->data[*ret2]; - return true; - } - return false; - } - - inline bool getRaw(const U32& value, U32& ret){ - U32* ret2 = nullptr; - if(this->htable.GetItem(&value, ret2, sizeof(U32))){ - ret = *ret2; - return true; - } - return false; - } - - inline bool set(const U32& value){ - U32 tot = this->data.size(); - if(!this->htable.SetItem(&value, tot, sizeof(U32))) - return false; - - 
this->data.push_back(value); - return true; - } - - inline U32 setGet(const U32& value){ - U32* ret = nullptr; - if(this->htable.GetItem(&value, ret, sizeof(U32))){ - return(*ret); - } else { - U32 tot = this->data.size(); - if(!this->htable.SetItem(&value, tot, sizeof(U32))) - return false; - - this->data.push_back(value); - return(tot); - } - } - - inline const size_t size(void) const{ return(this->data.size()); } - inline const U32& operator[](const U32& p) const{ return(this->data[p]); } - inline void clear(void){ - this->htable.clear(); - this->data.clear(); - } - -private: - id_vector data; - hash_table htable; -}; - -class HashVectorContainer{ - typedef HashVectorContainer self_type; - typedef hash::HashTable hash_table; - typedef std::vector< std::vector > id_vector; - -public: - HashVectorContainer() : htable(65536, 250){} - ~HashVectorContainer(){} - - inline const bool get(const U64& value, std::vector& ret) const{ - U32* ret2 = nullptr; - if(this->htable.GetItem(&value, ret2, sizeof(U64))){ - ret = this->data[*ret2]; - return true; - } - return false; - } - - inline const bool getRaw(const U64& value, U32& ret) const{ - U32* ret2 = nullptr; - if(this->htable.GetItem(&value, ret2, sizeof(U64))){ - ret = *ret2; - return true; - } - return false; - } - - inline bool set(const std::vector& value, const U64& hashValue){ - U32 tot = this->data.size(); - if(!this->htable.SetItem(&hashValue, tot, sizeof(U64))){ - return false; - } - this->data.push_back(value); - return true; - } - - inline U32 setGet(const std::vector& value, const U64& hashValue){ - U32* ret = nullptr; - if(this->htable.GetItem(&hashValue, ret, sizeof(U64))){ - return(*ret); - } else { - U32 tot = this->data.size(); - if(!this->htable.SetItem(&hashValue, tot, sizeof(U64))){ - return false; - } - this->data.push_back(value); - return(tot); - } - } - - inline const size_t size(void) const{ return(this->data.size()); } - inline const std::vector& operator[](const U32& p) const{ 
return(this->data[p]); } - inline void clear(void){ - this->htable.clear(); - this->data.clear(); - } - -private: - id_vector data; - hash_table htable; -}; - -} -} - -#endif /* CORE_HASHCONTAINER_H_ */ diff --git a/lib/containers/info_container.h b/lib/containers/info_container.h index c7533d2..46f1abe 100644 --- a/lib/containers/info_container.h +++ b/lib/containers/info_container.h @@ -27,46 +27,16 @@ class InfoContainer : public InfoContainerInterface{ typedef MetaContainer meta_container_type; typedef StrideContainer stride_container_type; + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + public: InfoContainer(); + InfoContainer(const bool is_flag); InfoContainer(const data_container_type& container); InfoContainer(const data_container_type& data_container, const meta_container_type& meta_container, const std::vector& pattern_matches); ~InfoContainer(void); - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - // Element access inline reference 
at(const size_type& position){ return(this->__containers[position]); } inline const_reference at(const size_type& position) const{ return(this->__containers[position]); } @@ -88,19 +58,22 @@ class InfoContainer : public InfoContainerInterface{ inline const_iterator cend() const{ return const_iterator(&this->__containers[this->n_entries]); } // Type-specific - inline std::ostream& to_vcf_string(std::ostream& stream, const U32 position) const{ utility::to_vcf_string(stream, this->at(position)); return(stream); } + inline std::ostream& to_vcf_string(std::ostream& stream, const U32 position) const{ + //utility::to_vcf_string(stream, this->at(position).data(), this->at(position).size()); + return(stream); + } inline io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const U32 position) const{ - utility::to_vcf_string(buffer, this->at(position)); + utility::to_vcf_string(buffer, this->at(position).data(), this->at(position).size()); return(buffer); } inline io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const U32 position) const{ - utility::to_json_string(buffer, this->at(position)); + //utility::to_json_string(buffer, this->at(position)); return(buffer); } - const bool emptyPosition(const U32& position) const{ return(this->at(position).empty()); } + inline bool emptyPosition(const U32& position) const{ return(this->at(position).empty()); } private: // For mixed strides @@ -110,6 +83,9 @@ class InfoContainer : public InfoContainerInterface{ template void __setupBalanced(const data_container_type& data_container, const meta_container_type& meta_container, const std::vector& pattern_matches); + void __setupBalancedFlag(const data_container_type& data_container, const meta_container_type& meta_container, const std::vector& pattern_matches); + + // For fixed strides template void __setup(const data_container_type& container, const U32 stride_size); @@ -132,56 +108,86 @@ InfoContainer::InfoContainer(void) : } +template +InfoContainer::InfoContainer(const bool 
is_flag) : + __containers(static_cast(::operator new[](1*sizeof(value_type)))) +{ + this->n_entries = 1; + this->n_capacity = 1; + // Set the primitive container value to 0. This + // is required for the yon1_t structures to point + // to something that is not simply a nullpointer. + // It Has no other practical uses. + new( &this->__containers[0] ) value_type( 0 ); +} + template InfoContainer::InfoContainer(const data_container_type& data_container, const meta_container_type& meta_container, const std::vector& pattern_matches) : __containers(nullptr) { - if(data_container.buffer_data_uncompressed.size() == 0) + if(data_container.buffer_data_uncompressed.size() == 0 && data_container.header.data_header.GetPrimitiveType() != YON_TYPE_BOOLEAN){ return; + } - if(data_container.header.data_header.hasMixedStride()){ - if(data_container.header.data_header.isSigned()){ - switch(data_container.header.data_header.getPrimitiveType()){ + if(data_container.header.data_header.HasMixedStride()){ + if(data_container.header.data_header.IsSigned()){ + switch(data_container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setupBalanced(data_container, meta_container, pattern_matches)); break; case(YON_TYPE_16B): (this->__setupBalanced(data_container, meta_container, pattern_matches)); break; case(YON_TYPE_32B): (this->__setupBalanced(data_container, meta_container, pattern_matches)); break; case(YON_TYPE_64B): (this->__setupBalanced(data_container, meta_container, pattern_matches)); break; case(YON_TYPE_FLOAT): (this->__setupBalanced(data_container, meta_container, pattern_matches)); break; case(YON_TYPE_DOUBLE): (this->__setupBalanced(data_container, meta_container, pattern_matches)); break; + case(YON_TYPE_BOOLEAN):(this->__setupBalancedFlag(data_container, meta_container, pattern_matches)); break; + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed type: " << 
(int)data_container.header.data_header.controller.type << std::endl; return; } } else { - switch(data_container.header.data_header.getPrimitiveType()){ + switch(data_container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setupBalanced(data_container, meta_container, pattern_matches)); break; case(YON_TYPE_16B): (this->__setupBalanced(data_container, meta_container, pattern_matches)); break; case(YON_TYPE_32B): (this->__setupBalanced(data_container, meta_container, pattern_matches)); break; case(YON_TYPE_64B): (this->__setupBalanced(data_container, meta_container, pattern_matches)); break; case(YON_TYPE_FLOAT): (this->__setupBalanced(data_container, meta_container, pattern_matches)); break; case(YON_TYPE_DOUBLE): (this->__setupBalanced(data_container, meta_container, pattern_matches)); break; + case(YON_TYPE_BOOLEAN):(this->__setupBalancedFlag(data_container, meta_container, pattern_matches)); break; + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed type: " << (int)data_container.header.data_header.controller.type << std::endl; return; } } } else { - if(data_container.header.data_header.isSigned()){ - switch(data_container.header.data_header.getPrimitiveType()){ + if(data_container.header.data_header.IsSigned()){ + switch(data_container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride)); break; case(YON_TYPE_16B): (this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride)); break; case(YON_TYPE_32B): (this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride)); break; case(YON_TYPE_64B): (this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride)); break; case(YON_TYPE_FLOAT): 
(this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride)); break; case(YON_TYPE_DOUBLE): (this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride)); break; + case(YON_TYPE_BOOLEAN):(this->__setupBalancedFlag(data_container, meta_container, pattern_matches)); break; + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed type: " << (int)data_container.header.data_header.controller.type << std::endl; return; } } else { - switch(data_container.header.data_header.getPrimitiveType()){ + switch(data_container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride)); break; case(YON_TYPE_16B): (this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride)); break; case(YON_TYPE_32B): (this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride)); break; case(YON_TYPE_64B): (this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride)); break; case(YON_TYPE_FLOAT): (this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride)); break; case(YON_TYPE_DOUBLE): (this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride)); break; + case(YON_TYPE_BOOLEAN):(this->__setupBalancedFlag(data_container, meta_container, pattern_matches)); break; + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed type: " << (int)data_container.header.data_header.controller.type << std::endl; return; } } @@ -192,51 +198,67 @@ template InfoContainer::InfoContainer(const data_container_type& container) : 
__containers(nullptr) { - if(container.buffer_data_uncompressed.size() == 0) + if(container.buffer_data_uncompressed.size() == 0 && container.header.data_header.GetPrimitiveType() != YON_TYPE_BOOLEAN) return; - if(container.header.data_header.hasMixedStride()){ - if(container.header.data_header.isSigned()){ - switch(container.header.data_header.getPrimitiveType()){ + if(container.header.data_header.HasMixedStride()){ + if(container.header.data_header.IsSigned()){ + switch(container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setup(container)); break; case(YON_TYPE_16B): (this->__setup(container)); break; case(YON_TYPE_32B): (this->__setup(container)); break; case(YON_TYPE_64B): (this->__setup(container)); break; case(YON_TYPE_FLOAT): (this->__setup(container)); break; case(YON_TYPE_DOUBLE): (this->__setup(container)); break; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed type: " << (int)container.header.data_header.controller.type << std::endl; return; } } else { - switch(container.header.data_header.getPrimitiveType()){ + switch(container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setup(container)); break; case(YON_TYPE_16B): (this->__setup(container)); break; case(YON_TYPE_32B): (this->__setup(container)); break; case(YON_TYPE_64B): (this->__setup(container)); break; case(YON_TYPE_FLOAT): (this->__setup(container)); break; case(YON_TYPE_DOUBLE): (this->__setup(container)); break; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed type: " << (int)container.header.data_header.controller.type << std::endl; return; } } } else { - if(container.header.data_header.isSigned()){ - switch(container.header.data_header.getPrimitiveType()){ + if(container.header.data_header.IsSigned()){ + switch(container.header.data_header.GetPrimitiveType()){ 
case(YON_TYPE_8B): (this->__setup(container, container.header.data_header.stride)); break; case(YON_TYPE_16B): (this->__setup(container, container.header.data_header.stride)); break; case(YON_TYPE_32B): (this->__setup(container, container.header.data_header.stride)); break; case(YON_TYPE_64B): (this->__setup(container, container.header.data_header.stride)); break; case(YON_TYPE_FLOAT): (this->__setup(container, container.header.data_header.stride)); break; case(YON_TYPE_DOUBLE): (this->__setup(container, container.header.data_header.stride)); break; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed type: " << (int)container.header.data_header.controller.type << std::endl; return; } } else { - switch(container.header.data_header.getPrimitiveType()){ + switch(container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setup(container, container.header.data_header.stride)); break; case(YON_TYPE_16B): (this->__setup(container, container.header.data_header.stride)); break; case(YON_TYPE_32B): (this->__setup(container, container.header.data_header.stride)); break; case(YON_TYPE_64B): (this->__setup(container, container.header.data_header.stride)); break; case(YON_TYPE_FLOAT): (this->__setup(container, container.header.data_header.stride)); break; case(YON_TYPE_DOUBLE): (this->__setup(container, container.header.data_header.stride)); break; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed type: " << (int)container.header.data_header.controller.type << std::endl; return; } @@ -298,15 +320,15 @@ void InfoContainer::__setupBalanced(const data_container_type& data stride_container_type strides(data_container); U32 current_offset = 0; - U32 stride_offset = 0; + U32 stride_offset = 0; for(U32 i = 0; i < this->size(); ++i){ // There are no INFO fields - if(meta_container[i].getInfoPatternID() == 
-1){ + if(meta_container[i].GetInfoPatternId() == -1){ new( &this->__containers[i] ) value_type( ); } // If pattern matches - else if(pattern_matches[meta_container[i].getInfoPatternID()]){ + else if(pattern_matches[meta_container[i].GetInfoPatternId()]){ new( &this->__containers[i] ) value_type( data_container, current_offset, strides[stride_offset] ); current_offset += strides[stride_offset] * sizeof(actual_primitive); ++stride_offset; @@ -316,7 +338,39 @@ void InfoContainer::__setupBalanced(const data_container_type& data new( &this->__containers[i] ) value_type( ); } } + assert(current_offset == data_container.buffer_data_uncompressed.size()); + assert(stride_offset == strides.size()); +} + +template +void InfoContainer::__setupBalancedFlag(const data_container_type& data_container, + const meta_container_type& meta_container, + const std::vector& pattern_matches) +{ + this->n_entries = meta_container.size(); + std::cerr << "in flag ctor info: " << this->size() << std::endl; + if(this->size() == 0) + return; + + this->__containers = static_cast(::operator new[](this->size()*sizeof(value_type))); + + for(U32 i = 0; i < this->size(); ++i){ + // There are no INFO fields + if(meta_container[i].GetInfoPatternId() == -1){ + new( &this->__containers[i] ) value_type( false ); + } + // If pattern matches + else if(pattern_matches[meta_container[i].GetInfoPatternId()]){ + std::cerr << "match add true: " << i << std::endl; + new( &this->__containers[i] ) value_type( true ); + } + // Otherwise place an empty + else { + new( &this->__containers[i] ) value_type( false ); + } + } + } template @@ -353,14 +407,14 @@ void InfoContainer::__setupBalanced(const data_container_type& data U32 current_offset = 0; // Case 1: if data is uniform - if(data_container.header.data_header.isUniform()){ + if(data_container.header.data_header.IsUniform()){ for(U32 i = 0; i < this->size(); ++i){ // There are no INFO fields - if(meta_container[i].getInfoPatternID() == -1){ + 
if(meta_container[i].GetInfoPatternId() == -1){ new( &this->__containers[i] ) value_type( ); } // If pattern matches - else if(pattern_matches[meta_container[i].getInfoPatternID()]){ + else if(pattern_matches[meta_container[i].GetInfoPatternId()]){ new( &this->__containers[i] ) value_type( data_container, 0, stride_size ); } else { new( &this->__containers[i] ) value_type( ); @@ -372,7 +426,7 @@ void InfoContainer::__setupBalanced(const data_container_type& data else { for(U32 i = 0; i < this->size(); ++i){ // If pattern matches - if(pattern_matches[meta_container[i].getInfoPatternID()]){ + if(pattern_matches[meta_container[i].GetInfoPatternId()]){ new( &this->__containers[i] ) value_type( data_container, current_offset, stride_size ); current_offset += stride_size * sizeof(actual_primitive); } diff --git a/lib/containers/info_container_interface.h b/lib/containers/info_container_interface.h index 806f215..d863033 100644 --- a/lib/containers/info_container_interface.h +++ b/lib/containers/info_container_interface.h @@ -15,14 +15,14 @@ class InfoContainerInterface{ virtual ~InfoContainerInterface(){} // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } + inline bool empty(void) const{ return(this->n_entries == 0); } inline const size_type& size(void) const{ return(this->n_entries); } inline const size_type& capacity(void) const{ return(this->n_capacity); } virtual std::ostream& to_vcf_string(std::ostream& stream, const U32 position) const =0; virtual io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const U32 position) const =0; virtual io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const U32 position) const =0; - virtual const bool emptyPosition(const U32& position) const =0; + virtual bool emptyPosition(const U32& position) const =0; protected: TACHYON_CORE_TYPE primitive_type; diff --git a/lib/containers/info_container_string.cpp b/lib/containers/info_container_string.cpp index 857af57..d25b24c 100644 --- 
a/lib/containers/info_container_string.cpp +++ b/lib/containers/info_container_string.cpp @@ -12,24 +12,24 @@ InfoContainer::InfoContainer() : InfoContainer::InfoContainer(const data_container_type& container) : __containers(nullptr) { -if(container.header.data_header.hasMixedStride()) - this->__setup(container); -else - this->__setup(container, container.header.data_header.stride); + if(container.header.data_header.HasMixedStride()) + this->__setup(container); + else + this->__setup(container, container.header.data_header.stride); } InfoContainer::InfoContainer(const data_container_type& data_container, const meta_container_type& meta_container, const std::vector& pattern_matches) : __containers(nullptr) { -if(data_container.header.data_header.hasMixedStride()) - this->__setupBalanced(data_container, meta_container, pattern_matches); -else - this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride); + if(data_container.header.data_header.HasMixedStride()) + this->__setupBalanced(data_container, meta_container, pattern_matches); + else + this->__setupBalanced(data_container, meta_container, pattern_matches, data_container.header.data_header.stride); } InfoContainer::~InfoContainer(void){ for(std::size_t i = 0; i < this->n_entries; ++i) - ((this->__containers + i)->~basic_string)(); + ((this->__containers + i)->~PrimitiveContainer)(); ::operator delete[](static_cast(this->__containers)); } @@ -69,11 +69,11 @@ void InfoContainer::__setupBalanced(const data_container_type& data for(U32 i = 0; i < this->size(); ++i){ // Meta entry has no INFO - if(meta_container[i].getInfoPatternID() == -1){ + if(meta_container[i].GetInfoPatternId() == -1){ new( &this->__containers[i] ) value_type( ); } // If pattern matches - else if(pattern_matches[meta_container[i].getInfoPatternID()]){ + else if(pattern_matches[meta_container[i].GetInfoPatternId()]){ new( &this->__containers[i] ) 
value_type(&data_container.buffer_data_uncompressed.data()[current_offset], strides[stride_offset]); current_offset += strides[stride_offset]; ++stride_offset; @@ -110,14 +110,14 @@ void InfoContainer::__setupBalanced(const data_container_type& data this->__containers = static_cast(::operator new[](this->n_entries*sizeof(value_type))); - if(data_container.header.data_header.isUniform() == false){ + if(data_container.header.data_header.IsUniform() == false){ U32 current_offset = 0; for(U32 i = 0; i < this->n_entries; ++i){ // If there are no INFO fields - if(meta_container[i].getInfoPatternID() == -1){ + if(meta_container[i].GetInfoPatternId() == -1){ new( &this->__containers[i] ) value_type( ); } // If pattern matches - else if(pattern_matches[meta_container[i].getInfoPatternID()]){ + else if(pattern_matches[meta_container[i].GetInfoPatternId()]){ new( &this->__containers[i] ) value_type(&data_container.buffer_data_uncompressed.data()[current_offset], stride_size); current_offset += stride_size; } @@ -132,7 +132,7 @@ void InfoContainer::__setupBalanced(const data_container_type& data else { for(U32 i = 0; i < this->n_entries; ++i){ // If pattern matches - if(pattern_matches[meta_container[i].getInfoPatternID()]){ + if(pattern_matches[meta_container[i].GetInfoPatternId()]){ new( &this->__containers[i] ) value_type(data_container.buffer_data_uncompressed.data(), stride_size); } // Otherwise place an empty diff --git a/lib/containers/info_container_string.h b/lib/containers/info_container_string.h index 4eaacf9..22c77a4 100644 --- a/lib/containers/info_container_string.h +++ b/lib/containers/info_container_string.h @@ -15,7 +15,7 @@ template <> class InfoContainer : public InfoContainerInterface{ private: typedef InfoContainer self_type; - typedef std::string value_type; + typedef PrimitiveContainer value_type; typedef value_type& reference; typedef const value_type& const_reference; typedef value_type* pointer; @@ -27,46 +27,15 @@ class InfoContainer : public 
InfoContainerInterface{ typedef MetaContainer meta_container_type; typedef StrideContainer stride_container_type; + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + public: InfoContainer(); InfoContainer(const data_container_type& container); InfoContainer(const data_container_type& data_container, const meta_container_type& meta_container, const std::vector& pattern_matches); ~InfoContainer(void); - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - // Element access inline reference at(const size_type& position){ return(this->__containers[position]); } inline const_reference at(const size_type& position) const{ return(this->__containers[position]); } @@ -80,7 +49,7 @@ class InfoContainer : public InfoContainerInterface{ inline const_reference back(void) const{ return(this->__containers[this->n_entries - 1]); } // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } + inline bool empty(void) const{ return(this->n_entries == 0); } 
inline const size_type& size(void) const{ return(this->n_entries); } // Iterator @@ -91,17 +60,18 @@ class InfoContainer : public InfoContainerInterface{ inline const_iterator cbegin() const{ return const_iterator(&this->__containers[0]); } inline const_iterator cend() const{ return const_iterator(&this->__containers[this->n_entries]); } - inline std::ostream& to_vcf_string(std::ostream& stream, const U32 position) const{ return(stream << this->at(position)); } - inline io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const U32 position) const{ buffer += this->at(position); return(buffer); } + inline std::ostream& to_vcf_string(std::ostream& stream, const U32 position) const{ return(stream << this->at(position).data_); } + inline io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const U32 position) const{ buffer += this->at(position).data_; return(buffer); } inline io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const U32 position) const{ if(this->at(position).size() == 0){ buffer += "null"; return(buffer); } - buffer += '"'; buffer += this->at(position); buffer += '"'; + buffer += '"'; buffer += this->at(position).data_; buffer += '"'; return(buffer); } - const bool emptyPosition(const U32& position) const{ return(this->at(position).empty()); } + + bool emptyPosition(const U32& position) const{ return(this->at(position).empty()); } private: // For mixed strides diff --git a/lib/containers/interval_container.cpp b/lib/containers/interval_container.cpp index 2539668..8e43b5e 100644 --- a/lib/containers/interval_container.cpp +++ b/lib/containers/interval_container.cpp @@ -19,7 +19,7 @@ IntervalContainer::~IntervalContainer(void){ } // Interpret -bool IntervalContainer::validateIntervalStrings(std::vector& interval_strings){ +bool IntervalContainer::ValidateIntervalStrings(std::vector& interval_strings){ if(interval_strings.size() == 0) return true; @@ -43,91 +43,94 @@ bool IntervalContainer::validateIntervalStrings(std::vector& interv return 
true; } -bool IntervalContainer::parseIntervals(std::vector& interval_strings, const header_type& header, const index_type& index){ +bool IntervalContainer::ParseIntervals(std::vector& interval_strings, const header_type& header, const index_type& index){ // Intervals pass expression tests - if(this->validateIntervalStrings(interval_strings) == false) - return(false); - // No intervals to parse if(interval_strings.size() == 0) - return true; + return(true); + + if(this->ValidateIntervalStrings(interval_strings) == false) + return(false); // Append given interval strings to internal vector of strings this->interval_strings_.insert( this->interval_strings_.end(), interval_strings.begin(), interval_strings.end() ); - // Assert that interval list data is of length n_contigs_; - if(this->interval_list_.size() != header.getContigNumber()) - this->interval_list_ = std::vector< std::vector< interval_type > >(header.getContigNumber()); - + // Assert that interval list data is of length n_contigs_ + // Note that this will truncate previous entries if resizing occurs + if(this->interval_list_.size() != header.GetNumberContigs()) + this->interval_list_ = std::vector< std::vector< interval_type > >(header.GetNumberContigs()); // Parse each interval for(U32 i = 0; i < interval_strings.size(); ++i){ interval_strings[i] = utility::remove_whitespace(interval_strings[i]); - core::HeaderContig* contig = nullptr; + const YonContig* contig = nullptr; // Chromosome only if (std::regex_match (interval_strings[i], constants::YON_REGEX_CONTIG_ONLY )){ - std::cerr << "chromosome only" << std::endl; - if(!header.getContig(interval_strings[i],contig)){ - std::cerr << "cant find contig: " << interval_strings[i] << std::endl; + //std::cerr << "chromosome only" << std::endl; + contig = header.GetContig(interval_strings[i]); + if(contig == nullptr){ + std::cerr << utility::timestamp("ERROR") << "Contig not defined in file: " << interval_strings[i] << std::endl; return(false); } - std::cerr << 
"Parsed: " << interval_strings[i] << std::endl; - this->interval_list_[contig->contigID].push_back(interval_type(0, contig->bp_length, contig->contigID)); - + //std::cerr << "Parsed: " << interval_strings[i] << " -> " << contig->name << ":" << contig->idx << std::endl; + std::vector target_blocks = index.findOverlap(contig->idx); + this->block_list_.insert( this->block_list_.end(), target_blocks.begin(), target_blocks.end() ); + this->interval_list_[contig->idx].push_back(interval_type(0, contig->n_bases, contig->idx)); } // Chromosome:position else if (std::regex_match (interval_strings[i], constants::YON_REGEX_CONTIG_POSITION )){ - std::cerr << "chromosome pos" << std::endl; + //std::cerr << "chromosome pos" << std::endl; std::vector substrings = utility::split(interval_strings[i], ':'); if(substrings[0].size() == 0 || substrings[1].size() == 0){ - std::cerr << "illegal form" << std::endl; + std::cerr << utility::timestamp("ERROR") << "Illegal form: " << interval_strings[i] << std::endl; return false; } - if(!header.getContig(substrings[0],contig)){ - std::cerr << "cant find contig: " << substrings[0] << std::endl; + contig = header.GetContig(substrings[0]); + if(contig == nullptr){ + std::cerr << utility::timestamp("ERROR") << "Contig not defined in file: " << interval_strings[i] << std::endl; return(false); } U64 position = atof(substrings[1].data()); - std::cerr << "Parsed: " << substrings[0] << "," << position << std::endl; + //std::cerr << "Parsed: " << substrings[0] << "," << position << std::endl; - std::vector target_blocks = index.findOverlap(contig->contigID, position); + std::vector target_blocks = index.findOverlap(contig->idx, position); + //std::cerr << "overlaps: " << target_blocks.size() << std::endl; this->block_list_.insert( this->block_list_.end(), target_blocks.begin(), target_blocks.end() ); - this->interval_list_[contig->contigID].push_back(interval_type(position, position, contig->contigID)); - - + 
this->interval_list_[contig->idx].push_back(interval_type(position, position, contig->idx)); } // Chromosome:position-position else if (std::regex_match (interval_strings[i], constants::YON_REGEX_CONTIG_RANGE )){ - std::cerr << "chromosome pos - pos" << std::endl; + //std::cerr << "chromosome pos - pos" << std::endl; std::vector substrings = utility::split(interval_strings[i], ':'); if(substrings[0].size() == 0 || substrings[1].size() == 0){ - std::cerr << "illegal form" << std::endl; + std::cerr << utility::timestamp("ERROR") << "Illegal form: " << std::endl; return false; } - if(!header.getContig(substrings[0],contig)){ - std::cerr << "cant find contig: " << substrings[0] << std::endl; + contig = header.GetContig(substrings[0]); + if(contig == nullptr){ + std::cerr << utility::timestamp("ERROR") << "Contig not defined in file: " << substrings[0] << std::endl; return(false); } std::vector position_strings = utility::split(substrings[1], '-'); if(position_strings[0].size() == 0 || position_strings[1].size() == 0){ - std::cerr << "illegal form" << std::endl; + std::cerr << utility::timestamp("ERROR") << "Illegal form: " << std::endl; return false; } U64 position_from = atof(position_strings[0].data()); U64 position_to = atof(position_strings[1].data()); if(position_from > position_to) std::swap(position_from, position_to); - std::cerr << "Parsed: " << substrings[0] << "," << position_from << "," << position_to << std::endl; + //std::cerr << "Parsed: " << substrings[0] << "," << position_from << "," << position_to << std::endl; - std::vector target_blocks = index.findOverlap(contig->contigID, position_from, position_to); + std::vector target_blocks = index.findOverlap(contig->idx, position_from, position_to); this->block_list_.insert( this->block_list_.end(), target_blocks.begin(), target_blocks.end() ); - this->interval_list_[contig->contigID].push_back(interval_type(position_from, position_to, contig->contigID)); + 
this->interval_list_[contig->idx].push_back(interval_type(position_from, position_to, contig->idx)); } else { std::cerr << utility::timestamp("ERROR") << "Uninterpretable interval string: " << interval_strings[i] << std::endl; @@ -137,18 +140,23 @@ bool IntervalContainer::parseIntervals(std::vector& interval_string } if(this->block_list_.size() == 0) - return true; + return(false); + + // Ascertain data is sorted according to YON blockID + // This has the implication that the traversal of a sliced sorted file + // will be sorted as output + std::sort(this->block_list_.begin(), this->block_list_.end()); return true; } -bool IntervalContainer::build(const header_type& header){ +bool IntervalContainer::Build(const header_type& header){ if(this->interval_list_.size() == 0) return true; // Dedupe blocks before building - this->dedupeBlockList(); + this->DedupeBlockList(); - this->n_entries_ = header.getContigNumber(); + this->n_entries_ = header.GetNumberContigs(); this->__entries = static_cast(::operator new[](this->n_entries_*sizeof(value_type))); for(U32 i = 0; i < this->n_entries_; ++i){ new( &this->__entries[i] ) value_type( std::move(this->interval_list_[i]) ); @@ -156,7 +164,7 @@ bool IntervalContainer::build(const header_type& header){ return true; } -void IntervalContainer::dedupeBlockList(void){ +void IntervalContainer::DedupeBlockList(void){ if(this->block_list_.size() == 0) return; // Dedupe diff --git a/lib/containers/interval_container.h b/lib/containers/interval_container.h index 06f5096..49b4f47 100644 --- a/lib/containers/interval_container.h +++ b/lib/containers/interval_container.h @@ -10,12 +10,13 @@ #include "index/index.h" #include "core/header/variant_header.h" #include "core/meta_entry.h" +#include "components/generic_iterator.h" namespace tachyon{ namespace containers{ class IntervalContainer { -private: +public: typedef IntervalContainer self_type; typedef std::size_t size_type; typedef algorithm::Interval interval_type; @@ -26,47 +27,16 @@ 
class IntervalContainer { typedef const value_type* const_pointer; typedef index::Index index_type; typedef index::IndexEntry index_entry_type; - typedef core::VariantHeader header_type; + typedef VariantHeader header_type; typedef core::MetaEntry meta_entry_type; + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + public: IntervalContainer(); ~IntervalContainer(void); - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - // Element access inline reference at(const size_type& position){ return(this->__entries[position]); } inline const_reference at(const size_type& position) const{ return(this->__entries[position]); } @@ -80,7 +50,7 @@ class IntervalContainer { inline const_reference back(void) const{ return(this->__entries[this->n_entries_ - 1]); } // Capacity - inline const bool empty(void) const{ return(this->n_entries_ == 0); } + inline bool empty(void) const{ return(this->n_entries_ == 0); } inline const size_type& size(void) const{ 
return(this->n_entries_); } // Iterator @@ -91,13 +61,13 @@ class IntervalContainer { inline const_iterator cbegin() const{ return const_iterator(&this->__entries[0]); } inline const_iterator cend() const{ return const_iterator(&this->__entries[this->n_entries_]); } - inline const bool hasBlockList(void) const{ return(this->block_list_.size()); } - inline std::vector& getBlockList(void){ return(this->block_list_); } - inline const std::vector& getBlockList(void) const{ return(this->block_list_); } - inline const bool hasIntervals(void) const{ return(this->interval_list_.size()); } + inline bool HasBlockList(void) const{ return(this->block_list_.size()); } + inline std::vector& GetBlockList(void){ return(this->block_list_); } + inline const std::vector& GetBlockList(void) const{ return(this->block_list_); } + inline bool HasIntervals(void) const{ return(this->interval_list_.size()); } // Interpret - bool validateIntervalStrings(std::vector& interval_strings); + bool ValidateIntervalStrings(std::vector& interval_strings); /**< * Parse interval strings. 
These strings have to match the regular expression @@ -105,22 +75,22 @@ class IntervalContainer { * YON_REGEX_CONTIG_ONLY, YON_REGEX_CONTIG_POSITION, or YON_REGEX_CONTIG_RANGE * @return Returns TRUE if successful or FALSE otherwise */ - bool parseIntervals(std::vector& interval_strings, const header_type& header, const index_type& index); + bool ParseIntervals(std::vector& interval_strings, const header_type& header, const index_type& index); - bool build(const header_type& header); + bool Build(const header_type& header); - inline std::vector find_overlaps(const U32& contigID, const S64& start_position, const S64& end_position) const{ + inline std::vector FindOverlaps(const U32& contigID, const S64& start_position, const S64& end_position) const{ if(contigID > this->size()) return(std::vector()); return(this->at(contigID).findOverlapping(start_position, end_position)); } - inline std::vector find_overlaps(const meta_entry_type& meta_entry) const{ + inline std::vector FindOverlaps(const meta_entry_type& meta_entry) const{ if(meta_entry.contigID > this->size()) return(std::vector()); return(this->at(meta_entry.contigID).findOverlapping(meta_entry.position, meta_entry.position + 1)); } private: - void dedupeBlockList(void); + void DedupeBlockList(void); private: U32 n_intervals_; diff --git a/lib/containers/meta_container.cpp b/lib/containers/meta_container.cpp index 0ca43c3..5b5e619 100644 --- a/lib/containers/meta_container.cpp +++ b/lib/containers/meta_container.cpp @@ -22,14 +22,15 @@ MetaContainer::~MetaContainer(void){ void MetaContainer::__ctor_setup(const block_type& block){ // Build containers for hot/cold depending on what is available - PrimitiveContainer contigs(block.meta_contig_container); - PrimitiveContainer controllers(block.meta_controller_container); - PrimitiveContainer positions(block.meta_positions_container); - PrimitiveContainer quality(block.meta_quality_container); - PrimitiveContainer refalt(block.meta_refalt_container); - 
PrimitiveContainer filterID(block.meta_filter_map_ids); - PrimitiveContainer formatID(block.meta_format_map_ids); - PrimitiveContainer infoID(block.meta_info_map_ids); + PrimitiveContainer contigs(block.base_containers[YON_BLK_CONTIG]); + PrimitiveContainer controllers(block.base_containers[YON_BLK_CONTROLLER]); + PrimitiveContainer positions(block.base_containers[YON_BLK_POSITION]); + PrimitiveContainer quality(block.base_containers[YON_BLK_QUALITY]); + PrimitiveContainer refalt(block.base_containers[YON_BLK_REFALT]); + PrimitiveContainer filterID(block.base_containers[YON_BLK_ID_FILTER]); + PrimitiveContainer formatID(block.base_containers[YON_BLK_ID_FORMAT]); + PrimitiveContainer infoID(block.base_containers[YON_BLK_ID_INFO]); + PrimitiveContainer ploidy(block.base_containers[YON_BLK_GT_PLOIDY]); for(U32 i = 0; i < this->size(); ++i){ new( &this->__entries[i] ) value_type( ); @@ -125,8 +126,8 @@ void MetaContainer::__ctor_setup(const block_type& block){ assert(refalt_position == refalt.size()); } - if(block.meta_alleles_container.buffer_data_uncompressed.size()){ - StrideContainer strides(block.meta_alleles_container); + if(block.base_containers[YON_BLK_ALLELES].buffer_data_uncompressed.size()){ + StrideContainer strides(block.base_containers[YON_BLK_ALLELES]); U32 offset = 0; U32 stride_offset = 0; for(U32 i = 0; i < this->size(); ++i){ @@ -135,26 +136,41 @@ void MetaContainer::__ctor_setup(const block_type& block){ this->__entries[i].alleles = static_cast(::operator new[](strides[stride_offset]*sizeof(value_type::allele_type))); for(U32 j = 0; j < strides[stride_offset]; ++j){ - const U16& l_string = *reinterpret_cast(&block.meta_alleles_container.buffer_data_uncompressed[offset]); - new( &this->__entries[i].alleles[j] ) value_type::allele_type( &block.meta_alleles_container.buffer_data_uncompressed[offset] ); + const U16& l_string = *reinterpret_cast(&block.base_containers[YON_BLK_ALLELES].buffer_data_uncompressed[offset]); + new( 
&this->__entries[i].alleles[j] ) value_type::allele_type( &block.base_containers[YON_BLK_ALLELES].buffer_data_uncompressed[offset] ); offset += sizeof(U16) + l_string; } ++stride_offset; } } - assert(offset == block.meta_alleles_container.getSizeUncompressed()); + assert(offset == block.base_containers[YON_BLK_ALLELES].GetSizeUncompressed()); } // Parse name - if(block.meta_names_container.getSizeUncompressed()){ - StrideContainer strides(block.meta_names_container); + if(block.base_containers[YON_BLK_NAMES].GetSizeUncompressed()){ + StrideContainer strides(block.base_containers[YON_BLK_NAMES]); U32 offset = 0; assert(strides.size() == this->size()); for(U32 i = 0; i < this->size(); ++i){ - this->__entries[i].name = std::string(&block.meta_names_container.buffer_data_uncompressed.data()[offset], strides[i]); + this->__entries[i].name = std::string(&block.base_containers[YON_BLK_NAMES].buffer_data_uncompressed.data()[offset], strides[i]); offset += strides[i]; } } + + // Parse ploidy. + if(ploidy.size()){ + if(ploidy.isUniform()){ + for(U32 i = 0; i < this->size(); ++i){ + this->at(i).n_base_ploidy = ploidy[0]; + } + } else { + assert(ploidy.size() == this->size()); + for(U32 i = 0; i < this->size(); ++i){ + this->at(i).n_base_ploidy = ploidy[i]; + std::cerr << (int)ploidy[i] << std::endl; + } + } + } } } diff --git a/lib/containers/meta_container.h b/lib/containers/meta_container.h index d044fb0..c52f49d 100644 --- a/lib/containers/meta_container.h +++ b/lib/containers/meta_container.h @@ -3,6 +3,7 @@ #include "variant_block.h" #include "core/meta_entry.h" +#include "components/generic_iterator.h" namespace tachyon{ namespace containers{ @@ -19,44 +20,13 @@ class MetaContainer { typedef VariantBlock block_type; typedef VariantBlockHeader block_header_type; + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + public: MetaContainer(const block_type& block); ~MetaContainer(void); - class iterator{ - private: - typedef iterator self_type; - 
typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - // Element access inline reference at(const size_type& position){ return(this->__entries[position]); } inline const_reference at(const size_type& position) const{ return(this->__entries[position]); } @@ -70,7 +40,7 @@ class MetaContainer { inline const_reference back(void) const{ return(this->__entries[this->n_entries - 1]); } // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } + inline bool empty(void) const{ return(this->n_entries == 0); } inline const size_type& size(void) const{ return(this->n_entries); } // Iterator diff --git a/lib/containers/primitive_container.h b/lib/containers/primitive_container.h index 6c1b42e..bf6f3d8 100644 --- a/lib/containers/primitive_container.h +++ b/lib/containers/primitive_container.h @@ -3,14 +3,40 @@ #include +#include "components/generic_iterator.h" #include "variant_block.h" #include "math/summary_statistics.h" +#include "utility/support_vcf.h" namespace tachyon{ namespace containers{ +class PrimitiveContainerInterface 
{ +public: + typedef PrimitiveContainerInterface self_type; + typedef std::size_t size_type; + +public: + PrimitiveContainerInterface(void) : is_uniform_(false), n_entries_(0){} + PrimitiveContainerInterface(const bool uniform, const size_t size) : is_uniform_(uniform), n_entries_(size){} + virtual ~PrimitiveContainerInterface(){} + + // Capacity + inline bool empty(void) const{ return(this->n_entries_ == 0); } + inline const size_type& size(void) const{ return(this->n_entries_); } + inline bool isUniform(void) const{ return(this->is_uniform_); } + + virtual io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer) const =0; + + virtual bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const =0; + +protected: + bool is_uniform_; + size_t n_entries_; +}; + template -class PrimitiveContainer{ +class PrimitiveContainer : public PrimitiveContainerInterface { private: typedef std::size_t size_type; typedef return_type value_type; @@ -21,46 +47,16 @@ class PrimitiveContainer{ typedef std::ptrdiff_t difference_type; typedef DataContainer container_type; + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + public: PrimitiveContainer(); + PrimitiveContainer(const return_type value); PrimitiveContainer(const container_type& container); PrimitiveContainer(const container_type& container, const U32& offset, const U32 n_entries); ~PrimitiveContainer(void); - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator 
self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - // Element access inline reference at(const size_type& position){ return(this->__entries[position]); } inline const_reference at(const size_type& position) const{ return(this->__entries[position]); } @@ -70,21 +66,26 @@ class PrimitiveContainer{ inline const_pointer data(void) const{ return(this->__entries); } inline reference front(void){ return(this->__entries[0]); } inline const_reference front(void) const{ return(this->__entries[0]); } - inline reference back(void){ return(this->__entries[this->n_entries - 1]); } - inline const_reference back(void) const{ return(this->__entries[this->n_entries - 1]); } + inline reference back(void){ return(this->__entries[this->n_entries_ - 1]); } + inline const_reference back(void) const{ return(this->__entries[this->n_entries_ - 1]); } - // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } - inline const size_type& size(void) const{ return(this->n_entries); } - inline const bool isUniform(void) const{ return(this->__uniform); } // Iterator inline iterator begin(){ return iterator(&this->__entries[0]); } - inline iterator end(){ return iterator(&this->__entries[this->n_entries]); } + inline iterator end(){ return iterator(&this->__entries[this->n_entries_]); } inline const_iterator begin() const{ return const_iterator(&this->__entries[0]); } - inline const_iterator end() const{ return const_iterator(&this->__entries[this->n_entries]); } + inline const_iterator end() const{ return 
const_iterator(&this->__entries[this->n_entries_]); } inline const_iterator cbegin() const{ return const_iterator(&this->__entries[0]); } - inline const_iterator cend() const{ return const_iterator(&this->__entries[this->n_entries]); } + inline const_iterator cend() const{ return const_iterator(&this->__entries[this->n_entries_]); } + + io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer) const{ + utility::to_vcf_string(buffer, this->data(), this->size()); + return(buffer); + } + + bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const{ + return(utility::UpdateHtslibVcfRecordInfo(rec, hdr, tag, this->data(), this->size())); + } private: template @@ -94,62 +95,130 @@ class PrimitiveContainer{ void __setupSigned(const container_type& container, const U32& offset); private: - bool __uniform; - size_t n_entries; pointer __entries; }; +template <> +class PrimitiveContainer : public PrimitiveContainerInterface{ +public: + typedef PrimitiveContainer self_type; + typedef std::string value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + +public: + PrimitiveContainer(){} + PrimitiveContainer(const char* data, const size_t l_data) : + PrimitiveContainerInterface(false, l_data), + data_(data, l_data) + {} + ~PrimitiveContainer(void){} + + // Element access + inline pointer data(void){ return(&this->data_); } + inline const_pointer data(void) const{ return(&this->data_); } + const bool empty(void) const{ return(this->data_.size() == 0); } + + // Iterator + inline iterator begin(){ return iterator(&this->data_); } + inline iterator end(){ return iterator(&this->data_ + this->n_entries_); } + inline const_iterator begin() const{ return const_iterator(&this->data_); } + inline 
const_iterator end() const{ return const_iterator(&this->data_ + this->n_entries_); } + inline const_iterator cbegin() const{ return const_iterator(&this->data_); } + inline const_iterator cend() const{ return const_iterator(&this->data_ + this->n_entries_); } + + io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer) const{ + if(this->data_.size() == 0){ + buffer += '.'; + return(buffer); + } + buffer += this->data_; + return(buffer); + } + + bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const{ + return(utility::UpdateHtslibVcfRecordInfo(rec, hdr, tag, this->data_)); + } + +public: + std::string data_; +}; + // IMPLEMENTATION ------------------------------------------------------------- template PrimitiveContainer::PrimitiveContainer() : - __uniform(false), - n_entries(0), __entries(nullptr) { } + +template +PrimitiveContainer::PrimitiveContainer(const return_type value) : + __entries(new return_type[1]) +{ + this->n_entries_ = 1; + this->__entries[0] = value; +} + template PrimitiveContainer::PrimitiveContainer(const container_type& container) : - __uniform(false), - n_entries(0), __entries(nullptr) { - if(container.header.data_header.getPrimitiveWidth() == -1) + if(container.header.data_header.GetPrimitiveWidth() == -1) return; - assert(container.buffer_data_uncompressed.size() % container.header.data_header.getPrimitiveWidth() == 0); + assert(container.buffer_data_uncompressed.size() % container.header.data_header.GetPrimitiveWidth() == 0); - this->n_entries = container.buffer_data_uncompressed.size() / container.header.data_header.getPrimitiveWidth(); - this->__entries = new value_type[this->n_entries]; + this->n_entries_ = container.buffer_data_uncompressed.size() / container.header.data_header.GetPrimitiveWidth(); + this->__entries = new value_type[this->n_entries_]; - if(this->n_entries == 0) + if(this->n_entries_ == 0) return; - this->__uniform = container.header.data_header.isUniform(); + this->is_uniform_ = 
container.header.data_header.IsUniform(); - if(container.header.data_header.isSigned()){ - switch(container.header.data_header.getPrimitiveType()){ + if(container.header.data_header.IsSigned()){ + switch(container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setupSigned(container, 0)); break; case(YON_TYPE_16B): (this->__setupSigned(container, 0)); break; case(YON_TYPE_32B): (this->__setupSigned(container, 0)); break; case(YON_TYPE_64B): (this->__setupSigned(container, 0)); break; case(YON_TYPE_FLOAT): (this->__setup(container, 0)); break; case(YON_TYPE_DOUBLE): (this->__setup(container, 0)); break; - default: std::cerr << "Disallowed: " << container.header.data_header.getPrimitiveType() << std::endl; return; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): + default: + std::cerr << "Disallowed: " << container.header.data_header.GetPrimitiveType() << std::endl; + return; } } else { - switch(container.header.data_header.getPrimitiveType()){ + switch(container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setup(container, 0)); break; case(YON_TYPE_16B): (this->__setup(container, 0)); break; case(YON_TYPE_32B): (this->__setup(container, 0)); break; case(YON_TYPE_64B): (this->__setup(container, 0)); break; case(YON_TYPE_FLOAT): (this->__setup(container, 0)); break; case(YON_TYPE_DOUBLE): (this->__setup(container, 0)); break; - default: std::cerr << "Disallowed: " << container.header.data_header.getPrimitiveType() << std::endl; return; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): + default: + std::cerr << "Disallowed: " << container.header.data_header.GetPrimitiveType() << std::endl; + return; } } } @@ -158,28 +227,35 @@ template PrimitiveContainer::PrimitiveContainer(const container_type& container, const U32& offset, const U32 n_entries) : - __uniform(false), - n_entries(n_entries), + PrimitiveContainerInterface(false, 
n_entries), __entries(new value_type[n_entries]) { - if(container.header.data_header.isSigned()){ - switch(container.header.data_header.getPrimitiveType()){ + if(container.header.data_header.IsSigned()){ + switch(container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setupSigned(container, offset)); break; case(YON_TYPE_16B): (this->__setupSigned(container, offset)); break; case(YON_TYPE_32B): (this->__setupSigned(container, offset)); break; case(YON_TYPE_64B): (this->__setupSigned(container, offset)); break; case(YON_TYPE_FLOAT): (this->__setup(container, offset)); break; case(YON_TYPE_DOUBLE): (this->__setup(container, offset)); break; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed" << std::endl; return; } } else { - switch(container.header.data_header.getPrimitiveType()){ + switch(container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setup(container, offset)); break; case(YON_TYPE_16B): (this->__setup(container, offset)); break; case(YON_TYPE_32B): (this->__setup(container, offset)); break; case(YON_TYPE_64B): (this->__setup(container, offset)); break; case(YON_TYPE_FLOAT): (this->__setup(container, offset)); break; case(YON_TYPE_DOUBLE): (this->__setup(container, offset)); break; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << "Disallowed" << std::endl; return; } } diff --git a/lib/containers/primitive_group_container.h b/lib/containers/primitive_group_container.h index 5454a69..7b1fd7c 100644 --- a/lib/containers/primitive_group_container.h +++ b/lib/containers/primitive_group_container.h @@ -1,14 +1,39 @@ #ifndef CONTAINERS_PRIMITIVE_GROUP_CONTAINER_H_ #define CONTAINERS_PRIMITIVE_GROUP_CONTAINER_H_ +#include "components/generic_iterator.h" #include "primitive_container.h" namespace tachyon{ namespace containers{ +class PrimitiveGroupContainerInterface { 
+public: + typedef PrimitiveGroupContainerInterface self_type; + typedef std::size_t size_type; + +public: + PrimitiveGroupContainerInterface() : n_objects_(0){ } + PrimitiveGroupContainerInterface(const size_type n_objects) : n_objects_(n_objects){ } + virtual ~PrimitiveGroupContainerInterface(){ } + + virtual io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const uint64_t position) const =0; + virtual bcf1_t* UpdateHtslibVcfRecordFormatInt32(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const =0; + virtual bcf1_t* UpdateHtslibVcfRecordFormatFloat(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const =0; + virtual bcf1_t* UpdateHtslibVcfRecordFormatString(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const =0; + + // Capacity + inline bool empty(void) const{ return(this->n_objects_ == 0); } + inline const size_type& size(void) const{ return(this->n_objects_); } + + +public: + size_type n_objects_; +}; + template -class PrimitiveGroupContainer{ -private: +class PrimitiveGroupContainer : public PrimitiveGroupContainerInterface { +public: typedef PrimitiveGroupContainer self_type; typedef PrimitiveContainer value_type; typedef std::size_t size_type; @@ -19,45 +44,14 @@ class PrimitiveGroupContainer{ typedef std::ptrdiff_t difference_type; typedef DataContainer data_container_type; + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + public: PrimitiveGroupContainer(); PrimitiveGroupContainer(const data_container_type& container, const U32& offset, const U32 n_objects, const U32 strides_each); ~PrimitiveGroupContainer(void); - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; 
} - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - // Element access inline reference at(const size_type& position){ return(this->__containers[position]); } inline const_reference at(const size_type& position) const{ return(this->__containers[position]); } @@ -67,27 +61,69 @@ class PrimitiveGroupContainer{ inline const_pointer data(void) const{ return(this->__containers); } inline reference front(void){ return(this->__containers[0]); } inline const_reference front(void) const{ return(this->__containers[0]); } - inline reference back(void){ return(this->__containers[this->__n_objects - 1]); } - inline const_reference back(void) const{ return(this->__containers[this->__n_objects - 1]); } - - // Capacity - inline const bool empty(void) const{ return(this->__n_objects == 0); } - inline const size_type& size(void) const{ return(this->__n_objects); } + inline reference back(void){ return(this->__containers[this->n_objects_ - 1]); } + inline const_reference back(void) const{ return(this->__containers[this->n_objects_ - 1]); } // Iterator inline iterator begin(){ return iterator(&this->__containers[0]); } - inline iterator end(){ return iterator(&this->__containers[this->__n_objects]); } + inline iterator end(){ return iterator(&this->__containers[this->n_objects_]); } inline const_iterator begin() const{ return const_iterator(&this->__containers[0]); } - inline const_iterator end() 
const{ return const_iterator(&this->__containers[this->__n_objects]); } + inline const_iterator end() const{ return const_iterator(&this->__containers[this->n_objects_]); } inline const_iterator cbegin() const{ return const_iterator(&this->__containers[0]); } - inline const_iterator cend() const{ return const_iterator(&this->__containers[this->__n_objects]); } + inline const_iterator cend() const{ return const_iterator(&this->__containers[this->n_objects_]); } + + io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const uint64_t position) const{ + utility::to_vcf_string(buffer, this->at(position).data(), this->at(position).size()); + return(buffer); + } + + bcf1_t* UpdateHtslibVcfRecordFormatInt32(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const{ + uint32_t n_records = 0; + for(U32 i = 0; i < this->size(); ++i) + n_records += this->at(i).size(); + + int32_t* dst = new int32_t[n_records]; + uint32_t n_offset = 0; + for(U32 i = 0; i < this->size(); ++i){ + utility::FormatDataHtslib(this->at(i).data(), &dst[n_offset], this->at(i).size()); + n_offset += this->at(i).size(); + } + + bcf_update_format_int32(hdr, rec, tag.data(), dst, n_records); + + delete [] dst; + return(rec); + } + + bcf1_t* UpdateHtslibVcfRecordFormatFloat(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const{ + uint32_t n_records = 0; + for(U32 i = 0; i < this->size(); ++i) + n_records += this->at(i).size(); + + float* dst = new float[n_records]; + uint32_t n_offset = 0; + for(U32 i = 0; i < this->size(); ++i){ + utility::FormatDataHtslib(this->at(i).data(), &dst[n_offset], this->at(i).size()); + n_offset += this->at(i).size(); + } + + bcf_update_format_float(hdr, rec, tag.data(), dst, n_records); + + delete [] dst; + return(rec); + } + + bcf1_t* UpdateHtslibVcfRecordFormatString(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const{ + std::cerr << "illegal conversion from any value to string" << std::endl; + exit(1); + return(rec); + } private: template void __setup(const 
data_container_type& container, const U32& offset, const U32 n_objects, const U32 strides_each); private: - size_type __n_objects; pointer __containers; }; @@ -96,40 +132,48 @@ class PrimitiveGroupContainer{ template -PrimitiveGroupContainer::PrimitiveGroupContainer() : __n_objects(0), __containers(nullptr){} +PrimitiveGroupContainer::PrimitiveGroupContainer() : __containers(nullptr){} template PrimitiveGroupContainer::PrimitiveGroupContainer(const data_container_type& container, const U32& offset, const U32 n_objects, const U32 strides_each) : - __n_objects(n_objects), - __containers(static_cast(::operator new[](this->__n_objects*sizeof(value_type)))) + PrimitiveGroupContainerInterface(n_objects), + __containers(static_cast(::operator new[](this->n_objects_*sizeof(value_type)))) { - if(container.header.data_header.isSigned()){ - switch(container.header.data_header.getPrimitiveType()){ + if(container.header.data_header.IsSigned()){ + switch(container.header.data_header.GetPrimitiveType()){ case(YON_TYPE_8B): (this->__setup(container, offset, n_objects, strides_each)); break; case(YON_TYPE_16B): (this->__setup(container, offset, n_objects, strides_each)); break; case(YON_TYPE_32B): (this->__setup(container, offset, n_objects, strides_each)); break; case(YON_TYPE_64B): (this->__setup(container, offset, n_objects, strides_each)); break; case(YON_TYPE_FLOAT): (this->__setup(container, offset, n_objects, strides_each)); break; case(YON_TYPE_DOUBLE): (this->__setup(container, offset, n_objects, strides_each)); break; - default: std::cerr << "Disallowed: " << container.header.data_header.getPrimitiveType() << std::endl; return; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): + default: std::cerr << "Disallowed: " << container.header.data_header.GetPrimitiveType() << std::endl; return; } } else { - switch(container.header.data_header.getPrimitiveType()){ + switch(container.header.data_header.GetPrimitiveType()){ 
case(YON_TYPE_8B): (this->__setup(container, offset, n_objects, strides_each)); break; case(YON_TYPE_16B): (this->__setup(container, offset, n_objects, strides_each)); break; case(YON_TYPE_32B): (this->__setup(container, offset, n_objects, strides_each)); break; case(YON_TYPE_64B): (this->__setup(container, offset, n_objects, strides_each)); break; case(YON_TYPE_FLOAT): (this->__setup(container, offset, n_objects, strides_each)); break; case(YON_TYPE_DOUBLE): (this->__setup(container, offset, n_objects, strides_each)); break; - default: std::cerr << "Disallowed: " << container.header.data_header.getPrimitiveType() << std::endl; return; + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): + default: std::cerr << "Disallowed: " << container.header.data_header.GetPrimitiveType() << std::endl; return; } } } template PrimitiveGroupContainer::~PrimitiveGroupContainer(){ - for(std::size_t i = 0; i < this->__n_objects; ++i) + for(std::size_t i = 0; i < this->n_objects_; ++i) ((this->__containers + i)->~value_type)(); ::operator delete[](static_cast(this->__containers)); @@ -139,7 +183,7 @@ template template void PrimitiveGroupContainer::__setup(const data_container_type& container, const U32& offset, const U32 n_objects, const U32 strides_each){ U32 current_offset = offset; - for(U32 i = 0; i < this->__n_objects; ++i){ + for(U32 i = 0; i < this->n_objects_; ++i){ new( &this->__containers[i] ) value_type( container, current_offset, strides_each ); current_offset += strides_each * sizeof(actual_primitive_type); } diff --git a/lib/containers/primitive_group_container_string.cpp b/lib/containers/primitive_group_container_string.cpp index dbe6be3..462dac8 100644 --- a/lib/containers/primitive_group_container_string.cpp +++ b/lib/containers/primitive_group_container_string.cpp @@ -3,32 +3,35 @@ namespace tachyon{ namespace containers{ -PrimitiveGroupContainer::PrimitiveGroupContainer() : __n_objects(0), __strings(nullptr){} 
+PrimitiveGroupContainer::PrimitiveGroupContainer() : containers_(nullptr){} -PrimitiveGroupContainer::PrimitiveGroupContainer(const data_container_type& container, const U32& offset, const U32& n_entries, const U32 strides_each) : - __n_objects(n_entries), // limitation - __strings(static_cast(::operator new[](this->size()*sizeof(value_type)))) +PrimitiveGroupContainer::PrimitiveGroupContainer(const data_container_type& container, + const U32& offset, + const U32& n_entries, + const U32 strides_each) : + PrimitiveGroupContainerInterface(n_entries), // limitation + containers_(static_cast(::operator new[](this->size()*sizeof(value_type)))) { U32 current_offset = offset; for(size_type i = 0; i < this->size(); ++i){ // check length - U32 j = 0; + size_type j = 0; for(; j < strides_each; ++j){ // Find premature end-of-string marker if(container.buffer_data_uncompressed[current_offset + j] == '\0'){ break; } } - new( &this->__strings[i] ) value_type( &container.buffer_data_uncompressed[current_offset], j ); + new( &this->containers_[i] ) value_type( &container.buffer_data_uncompressed[current_offset], j ); current_offset += strides_each; } } PrimitiveGroupContainer::~PrimitiveGroupContainer(){ for(std::size_t i = 0; i < this->size(); ++i) - ((this->__strings + i)->~basic_string)(); + ((this->containers_ + i)->~PrimitiveContainer)(); - ::operator delete[](static_cast(this->__strings)); + ::operator delete[](static_cast(this->containers_)); } } diff --git a/lib/containers/primitive_group_container_string.h b/lib/containers/primitive_group_container_string.h index e3ea267..6b273d2 100644 --- a/lib/containers/primitive_group_container_string.h +++ b/lib/containers/primitive_group_container_string.h @@ -1,16 +1,17 @@ #ifndef CONTAINERS_PRIMITIVE_GROUP_CONTAINER_STRING_H_ #define CONTAINERS_PRIMITIVE_GROUP_CONTAINER_STRING_H_ +#include "components/generic_iterator.h" #include "primitive_group_container.h" namespace tachyon{ namespace containers{ template <> -class 
PrimitiveGroupContainer{ +class PrimitiveGroupContainer : public PrimitiveGroupContainerInterface{ private: typedef PrimitiveGroupContainer self_type; - typedef std::string value_type; + typedef PrimitiveContainer value_type; typedef std::size_t size_type; typedef value_type& reference; typedef const value_type& const_reference; @@ -19,72 +20,67 @@ class PrimitiveGroupContainer{ typedef std::ptrdiff_t difference_type; typedef DataContainer data_container_type; + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + public: PrimitiveGroupContainer(); PrimitiveGroupContainer(const data_container_type& container, const U32& offset, const U32& n_entries, const U32 strides_each); ~PrimitiveGroupContainer(void); - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - // Element access - inline reference at(const size_type& position){ return(this->__strings[position]); } - inline const_reference at(const size_type& position) const{ return(this->__strings[position]); } - 
inline reference operator[](const size_type& position){ return(this->__strings[position]); } - inline const_reference operator[](const size_type& position) const{ return(this->__strings[position]); } - inline pointer data(void){ return(this->__strings); } - inline const_pointer data(void) const{ return(this->__strings); } - inline reference front(void){ return(this->__strings[0]); } - inline const_reference front(void) const{ return(this->__strings[0]); } - inline reference back(void){ return(this->__strings[this->__n_objects - 1]); } - inline const_reference back(void) const{ return(this->__strings[this->__n_objects - 1]); } + inline reference at(const size_type& position){ return(this->containers_[position]); } + inline const_reference at(const size_type& position) const{ return(this->containers_[position]); } + inline reference operator[](const size_type& position){ return(this->containers_[position]); } + inline const_reference operator[](const size_type& position) const{ return(this->containers_[position]); } + inline pointer data(void){ return(this->containers_); } + inline const_pointer data(void) const{ return(this->containers_); } + inline reference front(void){ return(this->containers_[0]); } + inline const_reference front(void) const{ return(this->containers_[0]); } + inline reference back(void){ return(this->containers_[this->n_objects_ - 1]); } + inline const_reference back(void) const{ return(this->containers_[this->n_objects_ - 1]); } // Capacity - inline const bool empty(void) const{ return(this->__n_objects == 0); } - inline const size_type& size(void) const{ return(this->__n_objects); } + inline bool empty(void) const{ return(this->n_objects_ == 0); } + inline const size_type& size(void) const{ return(this->n_objects_); } // Iterator - inline iterator begin(){ return iterator(&this->__strings[0]); } - inline iterator end(){ return iterator(&this->__strings[this->__n_objects]); } - inline const_iterator begin() const{ return 
const_iterator(&this->__strings[0]); } - inline const_iterator end() const{ return const_iterator(&this->__strings[this->__n_objects]); } - inline const_iterator cbegin() const{ return const_iterator(&this->__strings[0]); } - inline const_iterator cend() const{ return const_iterator(&this->__strings[this->__n_objects]); } + inline iterator begin(){ return iterator(&this->containers_[0]); } + inline iterator end(){ return iterator(&this->containers_[this->n_objects_]); } + inline const_iterator begin() const{ return const_iterator(&this->containers_[0]); } + inline const_iterator end() const{ return const_iterator(&this->containers_[this->n_objects_]); } + inline const_iterator cbegin() const{ return const_iterator(&this->containers_[0]); } + inline const_iterator cend() const{ return const_iterator(&this->containers_[this->n_objects_]); } + + io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const uint64_t position) const{ + this->at(position).to_vcf_string(buffer); + return(buffer); + } + + bcf1_t* UpdateHtslibVcfRecordFormatInt32(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const{ + return(rec); + } + + bcf1_t* UpdateHtslibVcfRecordFormatFloat(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const{ + return(rec); + } + + bcf1_t* UpdateHtslibVcfRecordFormatString(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag) const{ + const char** dst = new const char*[this->size()]; + + for(U32 i = 0; i < this->size(); ++i){ + dst[i] = this->at(i).data_.data(); + //std::cerr << dst[i] << std::endl; + } + + bcf_update_format_string(hdr, rec, tag.data(), dst, this->size()); + + delete [] dst; + return(rec); + } private: - size_type __n_objects; - pointer __strings; + pointer containers_; }; } diff --git a/lib/containers/stride_container.h b/lib/containers/stride_container.h index 3a28c4b..b58560c 100644 --- a/lib/containers/stride_container.h +++ b/lib/containers/stride_container.h @@ -3,6 +3,7 @@ #include +#include "components/generic_iterator.h" #include 
"data_container.h" namespace tachyon{ @@ -16,7 +17,7 @@ namespace containers{ */ template class StrideContainer{ -private: +public: typedef StrideContainer self_type; typedef std::size_t size_type; typedef return_primitive value_type; @@ -27,6 +28,9 @@ class StrideContainer{ typedef std::ptrdiff_t difference_type; typedef DataContainer data_container_type; + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + public: StrideContainer(); StrideContainer(const size_type start_capacity); @@ -35,40 +39,6 @@ class StrideContainer{ StrideContainer(const self_type& other); ~StrideContainer(void); - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - // Element access inline reference at(const size_type& position){ return(this->__entries[position]); } inline const_reference at(const size_type& position) const{ return(this->__entries[position]); } @@ -82,7 +52,7 @@ class StrideContainer{ inline const_reference back(void) const{ return(this->__entries[this->n_entries - 
1]); } // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } + inline bool empty(void) const{ return(this->n_entries == 0); } inline const size_type& size(void) const{ return(this->n_entries); } inline const size_type& capacity(void) const{ return(this->n_capacity); } @@ -142,8 +112,8 @@ class StrideContainer{ void __allocate(const data_container_type& container); // Todo: - const bool determineUniformity(void); - const int findSmallestPrimitive(void); + bool determineUniformity(void); + int findSmallestPrimitive(void); // 1) run find smallest primitive // 2) invoke stride container ctor with (larger_stride_container) @@ -216,11 +186,17 @@ StrideContainer::~StrideContainer(void){ template void StrideContainer::__setup(const data_container_type& container){ - switch(container.getStridePrimitiveType()){ + switch(container.GetStridePrimitiveType()){ case(YON_TYPE_8B): this->__allocate(container); break; case(YON_TYPE_16B): this->__allocate(container); break; case(YON_TYPE_32B): this->__allocate(container); break; case(YON_TYPE_64B): this->__allocate(container); break; + case(YON_TYPE_FLOAT): + case(YON_TYPE_DOUBLE): + case(YON_TYPE_BOOLEAN): + case(YON_TYPE_CHAR): + case(YON_TYPE_STRUCT): + case(YON_TYPE_UNKNOWN): default: std::cerr << utility::timestamp("ERROR") << "Illegal stride primitive: " << (int)container.header.stride_header.controller.type << std::endl; exit(1); } } diff --git a/lib/containers/variant_block.cpp b/lib/containers/variant_block.cpp index 0c45d5d..60f6752 100644 --- a/lib/containers/variant_block.cpp +++ b/lib/containers/variant_block.cpp @@ -7,138 +7,126 @@ namespace tachyon{ namespace containers{ VariantBlock::VariantBlock() : - info_containers(new container_type[200]), - format_containers(new container_type[200]), + n_info_c_allocated(0), + n_format_c_allocated(0), + base_containers(new container_type[YON_BLK_N_STATIC]), + info_containers(new container_type[0]), + format_containers(new container_type[0]), + 
gt_ppa(nullptr), end_block_(0), start_compressed_data_(0), end_compressed_data_(0) { - // Base container streams are always of type TYPE_STRUCT - this->meta_alleles_container.setType(YON_TYPE_STRUCT); - this->meta_controller_container.setType(tachyon::YON_TYPE_16B); - this->meta_refalt_container.setType(tachyon::YON_TYPE_8B); + this->base_containers[YON_BLK_ALLELES].SetType(YON_TYPE_STRUCT); + this->base_containers[YON_BLK_CONTROLLER].SetType(YON_TYPE_16B); + this->base_containers[YON_BLK_REFALT].SetType(YON_TYPE_8B); + this->footer_support.resize(65536); +} +VariantBlock::VariantBlock(const uint16_t n_info, const uint16_t n_format) : + n_info_c_allocated(n_info), + n_format_c_allocated(n_format), + base_containers(new container_type[YON_BLK_N_STATIC]), + info_containers(new container_type[n_info]), + format_containers(new container_type[n_format]), + gt_ppa(nullptr), + end_block_(0), + start_compressed_data_(0), + end_compressed_data_(0) +{ + this->base_containers[YON_BLK_ALLELES].SetType(YON_TYPE_STRUCT); + this->base_containers[YON_BLK_CONTROLLER].SetType(YON_TYPE_16B); + this->base_containers[YON_BLK_REFALT].SetType(YON_TYPE_8B); this->footer_support.resize(65536); } VariantBlock::~VariantBlock(){ + delete [] this->base_containers; delete [] this->info_containers; delete [] this->format_containers; + delete this->gt_ppa; } void VariantBlock::clear(void){ + for(U32 i = 0; i < YON_BLK_N_STATIC; ++i) + this->base_containers[i].reset(); + for(U32 i = 0; i < this->footer.n_info_streams; ++i) this->info_containers[i].reset(); for(U32 i = 0; i < this->footer.n_format_streams; ++i) this->format_containers[i].reset(); - this->header.reset(); - this->footer.reset(); - this->footer_support.reset(); + this->base_containers[YON_BLK_ALLELES].SetType(YON_TYPE_STRUCT); + this->base_containers[YON_BLK_CONTROLLER].SetType(YON_TYPE_16B); + this->base_containers[YON_BLK_REFALT].SetType(YON_TYPE_8B); - this->meta_contig_container.reset(); - this->meta_positions_container.reset(); - 
this->meta_refalt_container.reset(); - this->meta_controller_container.reset(); - this->meta_quality_container.reset(); - this->meta_names_container.reset(); - this->meta_alleles_container.reset(); - this->meta_info_map_ids.reset(); - this->meta_format_map_ids.reset(); - this->meta_filter_map_ids.reset(); - this->gt_support_data_container.reset(); - this->gt_rle8_container.reset(); - this->gt_rle16_container.reset(); - this->gt_rle32_container.reset(); - this->gt_rle64_container.reset(); - this->gt_simple8_container.reset(); - this->gt_simple16_container.reset(); - this->gt_simple32_container.reset(); - this->gt_simple64_container.reset(); - - // Base container data types are always TYPE_STRUCT - // Map ID fields are always S32 fields - this->meta_alleles_container.setType(YON_TYPE_STRUCT); - this->meta_controller_container.setType(tachyon::YON_TYPE_16B); - this->meta_refalt_container.setType(tachyon::YON_TYPE_8B); - - this->info_fields.clear(); - this->format_fields.clear(); - this->filter_fields.clear(); - this->info_patterns.clear(); - this->format_patterns.clear(); - this->filter_patterns.clear(); - - this->end_block_ = 0; + this->end_block_ = 0; this->start_compressed_data_ = 0; this->end_compressed_data_ = 0; - this->ppa_manager.reset(); + this->header.reset(); + this->footer.reset(); + this->footer_support.reset(); + + if(this->gt_ppa != nullptr) + this->gt_ppa->reset(); } void VariantBlock::resize(const U32 s){ if(s == 0) return; - this->meta_contig_container.resize(s); - this->meta_positions_container.resize(s); - this->meta_refalt_container.resize(s); - this->meta_controller_container.resize(s); - this->meta_quality_container.resize(s); - this->meta_names_container.resize(s); - this->meta_alleles_container.resize(s); - this->meta_info_map_ids.resize(s); - this->meta_format_map_ids.resize(s); - this->meta_filter_map_ids.resize(s); - this->gt_support_data_container.resize(s); - this->gt_rle8_container.resize(s); - this->gt_rle16_container.resize(s); - 
this->gt_rle32_container.resize(s); - this->gt_rle64_container.resize(s); - this->gt_simple8_container.resize(s); - this->gt_simple16_container.resize(s); - this->gt_simple32_container.resize(s); - this->gt_simple64_container.resize(s); - - for(U32 i = 0; i < 200; ++i){ + for(U32 i = 0; i < YON_BLK_N_STATIC; ++i) + this->base_containers[i].resize(s); + + for(U32 i = 0; i < n_info_c_allocated; ++i){ this->info_containers[i].resize(s); + } + + for(U32 i = 0; i < n_format_c_allocated; ++i){ this->format_containers[i].resize(s); } } -void VariantBlock::updateContainers(void){ - this->meta_contig_container.updateContainer(); - this->meta_positions_container.updateContainer(); - this->meta_refalt_container.updateContainer(); - this->meta_controller_container.updateContainer(false); - this->meta_quality_container.updateContainer(); - this->meta_names_container.updateContainer(); - this->meta_alleles_container.updateContainer(); - this->meta_filter_map_ids.updateContainer(); - this->meta_format_map_ids.updateContainer(); - this->meta_info_map_ids.updateContainer(); - this->gt_support_data_container.updateContainer(); - this->gt_rle8_container.updateContainer(false); - this->gt_rle16_container.updateContainer(false); - this->gt_rle32_container.updateContainer(false); - this->gt_rle64_container.updateContainer(false); - this->gt_simple8_container.updateContainer(false); - this->gt_simple16_container.updateContainer(false); - this->gt_simple32_container.updateContainer(false); - this->gt_simple64_container.updateContainer(false); +void VariantBlock::UpdateContainers(void){ + this->base_containers[YON_BLK_CONTIG].UpdateContainer(); + this->base_containers[YON_BLK_POSITION].UpdateContainer(); + this->base_containers[YON_BLK_REFALT].UpdateContainer(false, false); + this->base_containers[YON_BLK_QUALITY].UpdateContainer(); + this->base_containers[YON_BLK_NAMES].UpdateContainer(); + this->base_containers[YON_BLK_ALLELES].UpdateContainer(false, false); + 
this->base_containers[YON_BLK_ID_FILTER].UpdateContainer(); + this->base_containers[YON_BLK_ID_FORMAT].UpdateContainer(); + this->base_containers[YON_BLK_ID_INFO].UpdateContainer(); + this->base_containers[YON_BLK_GT_SUPPORT].UpdateContainer(); + this->base_containers[YON_BLK_GT_PLOIDY].UpdateContainer(); + + this->base_containers[YON_BLK_CONTROLLER].UpdateContainer(false, false); + this->base_containers[YON_BLK_GT_INT8].UpdateContainer(false, true); + this->base_containers[YON_BLK_GT_INT16].UpdateContainer(false, true); + this->base_containers[YON_BLK_GT_INT32].UpdateContainer(false, true); + this->base_containers[YON_BLK_GT_INT64].UpdateContainer(false, true); + this->base_containers[YON_BLK_GT_S_INT8].UpdateContainer(false, true); + this->base_containers[YON_BLK_GT_S_INT16].UpdateContainer(false, true); + this->base_containers[YON_BLK_GT_S_INT32].UpdateContainer(false, true); + this->base_containers[YON_BLK_GT_S_INT64].UpdateContainer(false, true); + this->base_containers[YON_BLK_GT_N_INT8].UpdateContainer(false, false); + this->base_containers[YON_BLK_GT_N_INT16].UpdateContainer(false, false); + this->base_containers[YON_BLK_GT_N_INT32].UpdateContainer(false, false); + this->base_containers[YON_BLK_GT_N_INT64].UpdateContainer(false, false); for(U32 i = 0; i < this->footer.n_info_streams; ++i){ assert(this->info_containers[i].header.data_header.stride != 0); - this->info_containers[i].updateContainer(); + this->info_containers[i].UpdateContainer(); } for(U32 i = 0; i < this->footer.n_format_streams; ++i){ assert(this->format_containers[i].header.data_header.stride != 0); - this->format_containers[i].updateContainer(); + this->format_containers[i].UpdateContainer(); } } -bool VariantBlock::readHeaderFooter(std::ifstream& stream){ +bool VariantBlock::ReadHeaderFooter(std::ifstream& stream){ if(!stream.good()){ std::cerr << utility::timestamp("ERROR") << "File stream is corrupted..." 
<< std::endl; return false; @@ -147,26 +135,30 @@ bool VariantBlock::readHeaderFooter(std::ifstream& stream){ stream >> this->header; // load header this->start_compressed_data_ = (U64)stream.tellg(); // start of compressed data stream.seekg(this->start_compressed_data_ + this->header.l_offset_footer); // seek to start of footer - this->end_compressed_data_ = stream.tellg(); // end of compressed data + this->end_compressed_data_ = stream.tellg(); // end of compressed data + + assert(stream.good()); U32 footer_uLength = 0; U32 footer_cLength = 0; - U32 footer_crc = 0; - stream.read(reinterpret_cast(&footer_uLength), sizeof(U32)); - stream.read(reinterpret_cast(&footer_cLength), sizeof(U32)); - stream.read(reinterpret_cast(&footer_crc), sizeof(U32)); + uint8_t footer_crc[MD5_DIGEST_LENGTH]; + utility::DeserializePrimitive(footer_uLength, stream); + utility::DeserializePrimitive(footer_cLength, stream); + stream.read(reinterpret_cast(&footer_crc[0]), MD5_DIGEST_LENGTH); + this->footer_support.resize(footer_cLength); stream.read(this->footer_support.buffer_data.data(), footer_cLength); this->footer_support.buffer_data.n_chars = footer_cLength; this->footer_support.buffer_data_uncompressed.resize(footer_uLength); + this->footer_support.buffer_data_uncompressed.n_chars = footer_uLength; this->footer_support.header.data_header.controller.encoder = YON_ENCODE_ZSTD; - this->footer_support.header.data_header.cLength = footer_cLength; - this->footer_support.header.data_header.uLength = footer_uLength; - this->footer_support.header.data_header.crc = footer_crc; + this->footer_support.header.data_header.cLength = footer_cLength; + this->footer_support.header.data_header.uLength = footer_uLength; + memcpy(&this->footer_support.header.data_header.crc[0], &footer_crc[0], MD5_DIGEST_LENGTH); // Assert end-of-block marker U64 eof_marker; - stream.read(reinterpret_cast(&eof_marker), sizeof(U64)); + utility::DeserializePrimitive(eof_marker, stream); assert(eof_marker == 
constants::TACHYON_BLOCK_EOF); this->end_block_ = stream.tellg(); // end-of-block offset stream.seekg(this->start_compressed_data_); @@ -175,44 +167,32 @@ bool VariantBlock::readHeaderFooter(std::ifstream& stream){ bool VariantBlock::read(std::ifstream& stream){ if(this->header.controller.hasGTPermuted && this->header.controller.hasGT){ - this->ppa_manager.header = this->footer.offset_ppa; - stream.seekg(this->start_compressed_data_ + this->footer.offset_ppa.data_header.offset); - stream >> this->ppa_manager; + stream.seekg(this->start_compressed_data_ + this->footer.offsets[YON_BLK_PPA].data_header.offset); + stream >> this->base_containers[YON_BLK_PPA]; } - this->__loadContainer(stream, this->footer.offset_meta_contig, this->meta_contig_container); - this->__loadContainer(stream, this->footer.offset_meta_position, this->meta_positions_container); - this->__loadContainer(stream, this->footer.offset_meta_controllers, this->meta_controller_container); - this->__loadContainer(stream, this->footer.offset_meta_quality, this->meta_quality_container); - this->__loadContainer(stream, this->footer.offset_meta_names, this->meta_names_container); - this->__loadContainer(stream, this->footer.offset_meta_refalt, this->meta_refalt_container); - this->__loadContainer(stream, this->footer.offset_meta_alleles, this->meta_alleles_container); - this->__loadContainer(stream, this->footer.offset_gt_8b, this->gt_rle8_container); - this->__loadContainer(stream, this->footer.offset_gt_16b, this->gt_rle16_container); - this->__loadContainer(stream, this->footer.offset_gt_32b, this->gt_rle32_container); - this->__loadContainer(stream, this->footer.offset_gt_64b, this->gt_rle64_container); - this->__loadContainer(stream, this->footer.offset_gt_simple8, this->gt_simple8_container); - this->__loadContainer(stream, this->footer.offset_gt_simple16, this->gt_simple16_container); - this->__loadContainer(stream, this->footer.offset_gt_simple32, this->gt_simple32_container); - 
this->__loadContainer(stream, this->footer.offset_gt_simple64, this->gt_simple64_container); - this->__loadContainer(stream, this->footer.offset_gt_helper, this->gt_support_data_container); - this->__loadContainer(stream, this->footer.offset_meta_info_id, this->meta_info_map_ids); - this->__loadContainer(stream, this->footer.offset_meta_filter_id, this->meta_filter_map_ids); - this->__loadContainer(stream, this->footer.offset_meta_format_id, this->meta_format_map_ids); + for(U32 i = 1; i < YON_BLK_N_STATIC; ++i) + this->LoadContainer(stream, this->footer.offsets[i], this->base_containers[i]); // Load all INFO + delete [] this->info_containers; + this->info_containers = new container_type[this->footer.n_info_streams]; + this->n_info_c_allocated = this->footer.n_info_streams; if(this->footer.n_info_streams){ stream.seekg(this->start_compressed_data_ + this->footer.info_offsets[0].data_header.offset); for(U32 i = 0; i < this->footer.n_info_streams; ++i) - this->__loadContainer(stream, this->footer.info_offsets[i], this->info_containers[i]); + this->LoadContainer(stream, this->footer.info_offsets[i], this->info_containers[i]); } // Load all FORMAT + delete [] this->format_containers; + this->format_containers = new container_type[this->footer.n_format_streams]; + this->n_format_c_allocated = this->footer.n_format_streams; if(this->footer.n_format_streams){ stream.seekg(this->start_compressed_data_ + this->footer.format_offsets[0].data_header.offset); for(U32 i = 0; i < this->footer.n_format_streams; ++i) - this->__loadContainer(stream, this->footer.format_offsets[i], this->format_containers[i]); + this->LoadContainer(stream, this->footer.format_offsets[i], this->format_containers[i]); // EOF assertion assert(this->end_compressed_data_ == (U64)stream.tellg()); @@ -222,70 +202,35 @@ bool VariantBlock::read(std::ifstream& stream){ return(true); } -const U64 VariantBlock::__determineCompressedSize(void) const{ +U64 VariantBlock::DetermineCompressedSize(void) const{ U64 
total = 0; if(this->header.controller.hasGT && this->header.controller.hasGTPermuted) - total += this->ppa_manager.getObjectSize(); - - total += this->meta_contig_container.getObjectSize(); - total += this->meta_positions_container.getObjectSize(); - total += this->meta_refalt_container.getObjectSize(); - total += this->meta_controller_container.getObjectSize(); - total += this->meta_quality_container.getObjectSize(); - total += this->meta_names_container.getObjectSize(); - total += this->meta_alleles_container.getObjectSize(); - total += this->meta_info_map_ids.getObjectSize(); - total += this->meta_format_map_ids.getObjectSize(); - total += this->meta_filter_map_ids.getObjectSize(); - total += this->gt_support_data_container.getObjectSize(); - total += this->gt_rle8_container.getObjectSize(); - total += this->gt_rle16_container.getObjectSize(); - total += this->gt_rle32_container.getObjectSize(); - total += this->gt_rle64_container.getObjectSize(); - total += this->gt_simple8_container.getObjectSize(); - total += this->gt_simple16_container.getObjectSize(); - total += this->gt_simple32_container.getObjectSize(); - total += this->gt_simple64_container.getObjectSize(); - - for(U32 i = 0; i < this->footer.n_info_streams; ++i) total += this->info_containers[i].getObjectSize(); - for(U32 i = 0; i < this->footer.n_format_streams; ++i) total += this->format_containers[i].getObjectSize(); + total += this->base_containers[YON_BLK_PPA].GetObjectSize(); + + for(U32 i = 1; i < YON_BLK_N_STATIC; ++i) total += this->base_containers[i].GetObjectSize(); + for(U32 i = 0; i < this->footer.n_info_streams; ++i) total += this->info_containers[i].GetObjectSize(); + for(U32 i = 0; i < this->footer.n_format_streams; ++i) total += this->format_containers[i].GetObjectSize(); return(total); } -void VariantBlock::updateOutputStatistics(import_stats_type& stats_basic, import_stats_type& stats_info, import_stats_type& stats_format){ - if(this->header.controller.hasGT && 
this->header.controller.hasGTPermuted){ - stats_basic[1].cost_uncompressed += this->ppa_manager.header.data_header.uLength; - stats_basic[1].cost_compressed += this->ppa_manager.header.data_header.cLength; - } +void VariantBlock::UpdateOutputStatistics(import_stats_type& stats_basic, + import_stats_type& stats_info, + import_stats_type& stats_format) +{ + if(this->header.controller.hasGT && this->header.controller.hasGTPermuted) + stats_basic[1] += this->base_containers[YON_BLK_PPA]; - stats_basic[2] += this->meta_contig_container; - stats_basic[3] += this->meta_positions_container; - stats_basic[4] += this->meta_refalt_container; - stats_basic[5] += this->meta_controller_container; - stats_basic[6] += this->meta_quality_container; - stats_basic[7] += this->meta_names_container; - stats_basic[8] += this->meta_alleles_container; - stats_basic[9] += this->meta_info_map_ids; - stats_basic[10] += this->meta_format_map_ids; - stats_basic[11] += this->meta_filter_map_ids; - stats_basic[12] += this->gt_support_data_container; - stats_basic[13] += this->gt_rle8_container; - stats_basic[14] += this->gt_rle16_container; - stats_basic[15] += this->gt_rle32_container; - stats_basic[16] += this->gt_rle64_container; - stats_basic[17] += this->gt_simple8_container; - stats_basic[18] += this->gt_simple16_container; - stats_basic[19] += this->gt_simple32_container; - stats_basic[20] += this->gt_simple64_container; + for(U32 i = 1; i < YON_BLK_N_STATIC; ++i) + stats_basic[i+1] += this->base_containers[i]; for(U32 i = 0; i < this->footer.n_info_streams; ++i){ - stats_basic[21] += this->info_containers[i]; + stats_basic[22] += this->info_containers[i]; stats_info[this->footer.info_offsets[i].data_header.global_key] += this->info_containers[i]; } for(U32 i = 0; i < this->footer.n_format_streams; ++i){ - stats_basic[22] += this->format_containers[i]; + stats_basic[23] += this->format_containers[i]; stats_format[this->footer.format_offsets[i].data_header.global_key] += 
this->format_containers[i]; } } @@ -295,108 +240,98 @@ bool VariantBlock::write(std::ostream& stream, import_stats_type& stats_info, import_stats_type& stats_format) { + if(stream.good() == false){ + return false; + } + const U64 begin_pos = stream.tellp(); - this->header.l_offset_footer = this->__determineCompressedSize(); + this->header.l_offset_footer = this->DetermineCompressedSize(); stream << this->header; const U64 start_pos = stream.tellp(); stats_basic[0].cost_uncompressed += start_pos - begin_pos; - if(this->header.controller.hasGT && this->header.controller.hasGTPermuted){ - this->footer.offset_ppa = this->ppa_manager.header; - this->footer.offset_ppa.data_header.offset = (U64)stream.tellp() - start_pos; - stream << this->ppa_manager; - } + if(this->header.controller.hasGT && this->header.controller.hasGTPermuted) + this->WriteContainer(stream, this->footer.offsets[YON_BLK_PPA], this->base_containers[YON_BLK_PPA], (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_meta_contig, this->meta_contig_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_meta_position, this->meta_positions_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_meta_controllers, this->meta_controller_container,(U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_meta_quality, this->meta_quality_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_meta_names, this->meta_names_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_meta_refalt, this->meta_refalt_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_meta_alleles, this->meta_alleles_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_meta_info_id, this->meta_info_map_ids, 
(U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_meta_filter_id, this->meta_filter_map_ids, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_meta_format_id, this->meta_format_map_ids, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_gt_helper, this->gt_support_data_container,(U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_gt_8b, this->gt_rle8_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_gt_16b, this->gt_rle16_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_gt_32b, this->gt_rle32_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_gt_64b, this->gt_rle64_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_gt_simple8, this->gt_simple8_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_gt_simple16, this->gt_simple16_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_gt_simple32, this->gt_simple32_container, (U64)stream.tellp() - start_pos); - this->__writeContainer(stream, this->footer.offset_gt_simple64, this->gt_simple64_container, (U64)stream.tellp() - start_pos); + // Start at offset 1 because offset 0 is encoding for the genotype + // permutation array that is handled differently. 
+ for(U32 i = 1; i < YON_BLK_N_STATIC; ++i) + this->WriteContainer(stream, this->footer.offsets[i], this->base_containers[i], (U64)stream.tellp() - start_pos); for(U32 i = 0; i < this->footer.n_info_streams; ++i) - this->__writeContainer(stream, this->footer.info_offsets[i], this->info_containers[i], (U64)stream.tellp() - start_pos); + this->WriteContainer(stream, this->footer.info_offsets[i], this->info_containers[i], (U64)stream.tellp() - start_pos); for(U32 i = 0; i < this->footer.n_format_streams; ++i) - this->__writeContainer(stream, this->footer.format_offsets[i], this->format_containers[i], (U64)stream.tellp() - start_pos); + this->WriteContainer(stream, this->footer.format_offsets[i], this->format_containers[i], (U64)stream.tellp() - start_pos); - // writing footer + // Assert that the written amount equals the expected amount. assert(this->header.l_offset_footer == (U64)stream.tellp() - start_pos); // Update stats - this->updateOutputStatistics(stats_basic, stats_info, stats_format); + this->UpdateOutputStatistics(stats_basic, stats_info, stats_format); return(stream.good()); } bool VariantBlock::operator+=(meta_entry_type& meta_entry){ // Meta positions - this->meta_positions_container.Add((S32)meta_entry.position); - ++this->meta_positions_container; + this->base_containers[YON_BLK_POSITION].Add((S32)meta_entry.position); + ++this->base_containers[YON_BLK_POSITION]; // Contig ID - this->meta_contig_container.Add((S32)meta_entry.contigID); - ++this->meta_contig_container; + this->base_containers[YON_BLK_CONTIG].Add((S32)meta_entry.contigID); + ++this->base_containers[YON_BLK_CONTIG]; // Ref-alt data - if(meta_entry.usePackedRefAlt()){ // Is simple SNV and possible extra case when in gVCF + if(meta_entry.UsePackedRefAlt()){ // Is simple SNV and possible extra case when in gVCF meta_entry.controller.alleles_packed = true; - const BYTE ref_alt = meta_entry.packRefAltByte(); - this->meta_refalt_container.AddLiteral(ref_alt); - ++this->meta_refalt_container; + 
const BYTE ref_alt = meta_entry.PackRefAltByte(); + this->base_containers[YON_BLK_REFALT].AddLiteral(ref_alt); + ++this->base_containers[YON_BLK_REFALT]; } // add complex else { // Special encoding for(U32 i = 0; i < meta_entry.n_alleles; ++i){ // Write out allele - this->meta_alleles_container.AddLiteral((U16)meta_entry.alleles[i].l_allele); - this->meta_alleles_container.AddCharacter(meta_entry.alleles[i].allele, meta_entry.alleles[i].l_allele); + this->base_containers[YON_BLK_ALLELES].AddLiteral((U16)meta_entry.alleles[i].l_allele); + this->base_containers[YON_BLK_ALLELES].AddCharacter(meta_entry.alleles[i].allele, meta_entry.alleles[i].l_allele); } - ++this->meta_alleles_container; // update before to not trigger - this->meta_alleles_container.addStride(meta_entry.n_alleles); + ++this->base_containers[YON_BLK_ALLELES]; // update before to not trigger + this->base_containers[YON_BLK_ALLELES].AddStride(meta_entry.n_alleles); } // Quality - this->meta_quality_container.Add(meta_entry.quality); - ++this->meta_quality_container; + this->base_containers[YON_BLK_QUALITY].Add(meta_entry.quality); + ++this->base_containers[YON_BLK_QUALITY]; // Variant name - this->meta_names_container.addStride(meta_entry.name.size()); - this->meta_names_container.AddCharacter(meta_entry.name); - ++this->meta_names_container; + this->base_containers[YON_BLK_NAMES].AddStride(meta_entry.name.size()); + this->base_containers[YON_BLK_NAMES].AddCharacter(meta_entry.name); + ++this->base_containers[YON_BLK_NAMES]; // Tachyon pattern identifiers - this->meta_info_map_ids.Add(meta_entry.info_pattern_id); - this->meta_format_map_ids.Add(meta_entry.format_pattern_id); - this->meta_filter_map_ids.Add(meta_entry.filter_pattern_id); - ++this->meta_info_map_ids; - ++this->meta_format_map_ids; - ++this->meta_filter_map_ids; + this->base_containers[YON_BLK_ID_INFO].Add(meta_entry.info_pattern_id); + this->base_containers[YON_BLK_ID_FORMAT].Add(meta_entry.format_pattern_id); + 
this->base_containers[YON_BLK_ID_FILTER].Add(meta_entry.filter_pattern_id); + ++this->base_containers[YON_BLK_ID_INFO]; + ++this->base_containers[YON_BLK_ID_FORMAT]; + ++this->base_containers[YON_BLK_ID_FILTER]; // Check if all variants are of length 1 (as in all alleles are SNVs) bool all_snv = true; for(U32 i = 0; i < meta_entry.n_alleles; ++i){ - if(meta_entry.alleles[i].l_allele != 1) all_snv = false; + if(meta_entry.alleles[i].size() != 1) all_snv = false; } meta_entry.controller.all_snv = all_snv; // Controller - this->meta_controller_container.AddLiteral((U16)meta_entry.controller.toValue()); // has been overloaded - ++this->meta_controller_container; + this->base_containers[YON_BLK_CONTROLLER].AddLiteral((U16)meta_entry.controller.toValue()); // has been overloaded + ++this->base_containers[YON_BLK_CONTROLLER]; + + // Ploidy + this->base_containers[YON_BLK_GT_PLOIDY].Add(meta_entry.n_base_ploidy); + ++this->base_containers[YON_BLK_GT_PLOIDY]; return true; } diff --git a/lib/containers/variant_block.h b/lib/containers/variant_block.h index 45108a4..0012a30 100644 --- a/lib/containers/variant_block.h +++ b/lib/containers/variant_block.h @@ -1,14 +1,17 @@ #ifndef CORE_BLOCKENTRY_H_ #define CORE_BLOCKENTRY_H_ -#include "algorithm/permutation/permutation_manager.h" +#include + +#include "third_party/xxhash/xxhash.h" + #include "components/variant_block_footer.h" #include "components/variant_block_header.h" #include "core/data_block_settings.h" #include "data_container.h" #include "core/meta_entry.h" #include "core/variant_importer_container_stats.h" -#include "io/vcf/VCFHeader.h" +#include "core/genotypes.h" namespace tachyon{ namespace containers{ @@ -19,13 +22,11 @@ namespace containers{ * contents. 
*/ class VariantBlock{ +public: typedef VariantBlock self_type; typedef DataContainer container_type; - typedef algorithm::PermutationManager permutation_type; typedef VariantBlockHeader block_header_type; typedef VariantBlockFooter block_footer_type; - typedef HashContainer hash_container_type; - typedef HashVectorContainer hash_vector_container_type; typedef io::BasicBuffer buffer_type; typedef support::VariantImporterContainerStats import_stats_type; typedef DataContainerHeader offset_type; @@ -33,8 +34,27 @@ class VariantBlock{ public: VariantBlock(); + VariantBlock(const uint16_t n_info, const uint16_t n_format); ~VariantBlock(); + void Allocate(const uint16_t n_info, + const uint16_t n_format, + const uint16_t n_filter) + { + // Allocate space for INFO containers. + delete [] this->info_containers; + this->info_containers = new container_type[n_info]; + this->n_info_c_allocated = n_info; + + // Allocate space for FORMAT containers. + delete [] this->format_containers; + this->format_containers = new container_type[n_format]; + this->n_format_c_allocated = n_format; + + // Allocate space for headers. 
+ this->footer.AllocateHeaders(n_info, n_format, n_filter); + } + /**< @brief Resize base container buffer streams * Internal use only * @param s Size in bytes @@ -50,72 +70,6 @@ class VariantBlock{ inline const U32& size(void) const{ return(this->header.n_variants); } - // - //inline const size_t getINFOLoaded(void) const{ return(this->info_loaded.size()); } - //inline const size_t getFORMATLoaded(void) const{ return(this->format_loaded.size()); } - - inline U32 addFieldINFO(const U32 fieldID){ return(this->info_fields.setGet(fieldID)); } - inline U32 addFieldFORMAT(const U32 fieldID){ return(this->format_fields.setGet(fieldID)); } - inline U32 addFieldFILTER(const U32 fieldID){ return(this->filter_fields.setGet(fieldID)); } - - inline const S32 getPatternsINFO(const U64& hash_pattern) const{ - U32 mapID = 0; - if(this->info_patterns.getRaw(hash_pattern, mapID)) - return(mapID); - else return(-1); - } - - inline const S32 getPatternsFORMAT(const U64& hash_pattern) const{ - U32 mapID = 0; - if(this->format_patterns.getRaw(hash_pattern, mapID)) - return(mapID); - else return(-1); - } - - inline const S32 getPatternsFILTER(const U64& hash_pattern) const{ - U32 mapID = 0; - if(this->filter_patterns.getRaw(hash_pattern, mapID)) - return(mapID); - else return(-1); - } - - inline void addPatternINFO(const std::vector& pattern, const U64& hash_pattern){ - if(!this->info_patterns.set(pattern, hash_pattern)){ - std::cerr << "failed to insert filter: " << pattern.size() << " and " << hash_pattern << std::endl; - exit(1); - } - } - - inline void addPatternFORMAT(const std::vector& pattern, const U64& hash_pattern){ - if(!this->format_patterns.set(pattern, hash_pattern)){ - std::cerr << "failed to insert filter: " << pattern.size() << " and " << hash_pattern << std::endl; - exit(1); - } - } - - inline void addPatternFILTER(const std::vector& pattern, const U64& hash_pattern){ - if(!this->filter_patterns.set(pattern, hash_pattern)){ - std::cerr << "failed to insert filter: " << 
pattern.size() << " and " << hash_pattern << std::endl; - exit(1); - } - } - - /**< - * Finalize this block before writing to disk. This wrapper function - * calls all necessary functions to construct a valid Tachyon block - * for sequence variant data - */ - inline void finalize(void){ - this->footer.n_info_streams = this->info_fields.size(); - this->footer.n_filter_streams = this->filter_fields.size(); - this->footer.n_format_streams = this->format_fields.size(); - this->footer.allocateDiskOffsets(this->footer.n_info_streams, this->footer.n_format_streams, this->footer.n_filter_streams); - this->updateContainers(); - this->footer.constructBitVector(containers::VariantBlockFooter::INDEX_INFO, this->info_fields, this->info_patterns); - this->footer.constructBitVector(containers::VariantBlockFooter::INDEX_FILTER, this->filter_fields, this->filter_patterns); - this->footer.constructBitVector(containers::VariantBlockFooter::INDEX_FORMAT, this->format_fields, this->format_patterns); - } - /**< @brief Reads all digital objects from disk * Primary function for reading data from disk. Data * read in this way is not checked for integrity here. @@ -129,7 +83,7 @@ class VariantBlock{ * @param stream * @return */ - bool readHeaderFooter(std::ifstream& stream); + bool ReadHeaderFooter(std::ifstream& stream); /**< * Standard way of writing out a YON block. 
@@ -156,13 +110,13 @@ class VariantBlock{ * @param info_ids Vector of global INFO keys * @return Returns the set intersection of provided keys and local keys */ - std::vector intersectInfoKeys(const std::vector& info_ids) const{ - std::vector info_ids_found; - if(info_ids.size() == 0) return(info_ids_found); + std::vector IntersectInfoKeys(const std::vector& info_ids_global) const{ + std::vector info_ids_found; + if(info_ids_global.size() == 0) return(info_ids_found); - for(U32 i = 0; i < info_ids.size(); ++i){ + for(U32 i = 0; i < info_ids_global.size(); ++i){ for(U32 j = 0; j < this->footer.n_info_streams; ++j){ - if(this->footer.info_offsets[j].data_header.global_key == info_ids[i]) + if(this->footer.info_offsets[j].data_header.global_key == info_ids_global[i]) info_ids_found.push_back(this->footer.info_offsets[j].data_header.global_key); } } @@ -170,27 +124,58 @@ class VariantBlock{ return(info_ids_found); } + std::vector IntersectInfoPatterns(const std::vector& info_ids_global, const uint32_t local_id) const{ + std::vector info_ids_found; + if(info_ids_global.size() == 0) return(info_ids_found); + assert(local_id < this->footer.n_info_patterns); + + for(U32 i = 0; i < info_ids_global.size(); ++i){ + for(U32 k = 0; k < this->footer.info_patterns[local_id].pattern.size(); ++k){ + if(this->footer.info_patterns[local_id].pattern[k] == info_ids_global[i]){ + info_ids_found.push_back(this->footer.info_patterns[local_id].pattern[k]); + } + } + } + + return(info_ids_found); + } + /**< * Compares a vector of global FORMAT identifiers to the identifier set in this * block and returns the set intersection of keys * @param info_ids Vector of global FORMAT keys * @return Returns the set intersection of provided keys and local keys */ - std::vector intersectFormatKeys(const std::vector& format_ids) const{ - std::vector format_ids_found; - if(format_ids.size() == 0) return(format_ids_found); + std::vector IntersectFormatKeys(const std::vector& format_ids_global) const{ + 
std::vector format_ids_found; + if(format_ids_global.size() == 0) return(format_ids_found); + + for(U32 i = 0; i < format_ids_global.size(); ++i){ + for(U32 j = 0; j < this->footer.n_format_streams; ++j){ + if(this->footer.format_offsets[j].data_header.global_key == format_ids_global[i]) + format_ids_found.push_back(this->footer.format_offsets[j].data_header.global_key); + } + } - for(U32 i = 0; i < format_ids.size(); ++i){ - for(U32 j = 0; j < this->footer.n_info_streams; ++j){ - if(this->footer.info_offsets[j].data_header.global_key == format_ids[i]) - format_ids_found.push_back(this->footer.info_offsets[j].data_header.global_key); + return(format_ids_found); + } + + std::vector IntersectFormatPatterns(const std::vector& format_ids_global, const uint32_t local_id) const{ + std::vector format_ids_found; + if(format_ids_global.size() == 0) return(format_ids_found); + assert(local_id < this->footer.n_format_patterns); + + for(U32 i = 0; i < format_ids_global.size(); ++i){ + for(U32 k = 0; k < this->footer.format_patterns[local_id].pattern.size(); ++k){ + if(this->footer.format_patterns[local_id].pattern[k] == format_ids_global[i]) + format_ids_found.push_back(this->footer.format_patterns[local_id].pattern[k]); } } return(format_ids_found); } - std::vector getFormatKeys(void) const{ + std::vector GetFormatKeys(void) const{ std::vector ret; for(U32 i = 0; i < this->footer.n_format_streams; ++i) ret.push_back(this->footer.format_offsets[i].data_header.global_key); @@ -198,7 +183,7 @@ class VariantBlock{ return(ret); } - std::vector getInfoKeys(void) const{ + std::vector GetInfoKeys(void) const{ std::vector ret; for(U32 i = 0; i < this->footer.n_info_streams; ++i) ret.push_back(this->footer.info_offsets[i].data_header.global_key); @@ -213,7 +198,10 @@ class VariantBlock{ * @param container Destination container object * @return */ - inline bool __loadContainer(std::ifstream& stream, const offset_type& offset, container_type& container){ + inline bool 
LoadContainer(std::ifstream& stream, + const offset_type& offset, + container_type& container) + { container.header = offset; stream >> container; assert(container.header == offset); @@ -228,7 +216,10 @@ class VariantBlock{ * @param container Destination container object * @return */ - inline bool __loadContainerSeek(std::ifstream& stream, const offset_type& offset, container_type& container){ + inline bool LoadContainerSeek(std::ifstream& stream, + const offset_type& offset, + container_type& container) + { stream.seekg(this->start_compressed_data_ + offset.data_header.offset); container.header = offset; stream >> container; @@ -236,7 +227,6 @@ class VariantBlock{ return(stream.good()); } -private: /**< @brief Update base container header data and evaluate output byte streams * Internal use only (import): Collectively updates base * container offsets and checks/builds @@ -244,29 +234,141 @@ class VariantBlock{ * 2) Generates CRC checksums for both data and strides * 3) Reformat (change used primitive type) for strides and data; if possible */ - void updateContainers(void); + void UpdateContainers(void); /**< * Determine compressed block-size. 
Execute this function prior to writing a * block * @return Returns the sum total disk size */ - const U64 __determineCompressedSize(void) const; + U64 DetermineCompressedSize(void) const; + + inline void PackFooter(void){ + this->footer_support.reset(); + this->footer_support.buffer_data_uncompressed << this->footer; + ++this->footer_support; + } + + inline U32 AddInfoPattern(const std::vector& pattern){ return(this->footer.AddInfoPattern(pattern)); } + inline U32 AddFormatPattern(const std::vector& pattern){ return(this->footer.AddFormatPattern(pattern)); } + inline U32 AddFilterPattern(const std::vector& pattern){ return(this->footer.AddFilterPattern(pattern)); } + inline U32 AddInfo(const U32 id){ return(this->footer.AddInfo(id)); } + inline U32 AddFormat(const U32 id){ return(this->footer.AddFormat(id)); } + inline U32 AddFilter(const U32 id){ return(this->footer.AddFilter(id)); } + inline void Finalize(void){ this->footer.Finalize(); } + + S32 GetInfoPosition(const U32 global_id) const{ // Local position of a global INFO id, or -1 when it is not in this block. + if(this->footer.info_map == nullptr) return -1; // No lookup map: report not-found (-1); false would read as position 0. + VariantBlockFooter::map_type::const_iterator it = this->footer.info_map->find(global_id); + if(it == this->footer.info_map->end()) return -1; + return(it->second); + } + + S32 GetFormatPosition(const U32 global_id) const{ // Local position of a global FORMAT id, or -1 when it is not in this block. + if(this->footer.format_map == nullptr) return -1; // No lookup map: report not-found (-1); false would read as position 0. + VariantBlockFooter::map_type::const_iterator it = this->footer.format_map->find(global_id); + if(it == this->footer.format_map->end()) return -1; + return(it->second); + } + S32 GetFilterPosition(const U32 global_id) const{ // Local position of a global FILTER id, or -1 when it is not in this block. + if(this->footer.filter_map == nullptr) return -1; // No lookup map: report not-found (-1); false would read as position 0. + VariantBlockFooter::map_type::const_iterator it = this->footer.filter_map->find(global_id); + if(it == this->footer.filter_map->end()) return -1; + return(it->second); + } + + bool HasInfo(const U32 global_id) const{ + if(this->footer.info_map == nullptr) return false; + VariantBlockFooter::map_type::const_iterator it = this->footer.info_map->find(global_id); + if(it == 
this->footer.info_map->end()) return false; + return(true); + } + + bool HasFormat(const U32 global_id) const{ + if(this->footer.format_map == nullptr) return false; + VariantBlockFooter::map_type::const_iterator it = this->footer.format_map->find(global_id); + if(it == this->footer.format_map->end()) return false; + return(true); + } + + bool HasFilter(const U32 global_id) const{ + if(this->footer.filter_map == nullptr) return false; + VariantBlockFooter::map_type::const_iterator it = this->footer.filter_map->find(global_id); + if(it == this->footer.filter_map->end()) return false; + return(true); + } + + container_type* GetInfoContainer(const U32 global_id) const{ + if(this->HasInfo(global_id)) + return(&this->info_containers[this->footer.info_map->at(global_id)]); + else + return nullptr; + } + + container_type* GetFormatContainer(const U32 global_id) const{ + if(this->HasFormat(global_id)) + return(&this->format_containers[this->footer.format_map->at(global_id)]); + else + return nullptr; + } + + std::vector InfoPatternSetMembership(const int value) const{ + std::vector matches(this->footer.n_info_patterns, false); + for(U32 i = 0; i < this->footer.n_info_patterns; ++i){ + for(U32 j = 0; j < this->footer.info_patterns[i].pattern.size(); ++j){ + if(this->footer.info_patterns[i].pattern[j] == value){ + matches[i] = true; + break; + } + } + } + return(matches); + } + + std::vector FormatPatternSetMembership(const int value) const{ + std::vector matches(this->footer.n_format_patterns, false); + for(U32 i = 0; i < this->footer.n_format_patterns; ++i){ + for(U32 j = 0; j < this->footer.format_patterns[i].pattern.size(); ++j){ + if(this->footer.format_patterns[i].pattern[j] == value){ + matches[i] = true; + break; + } + } + } + return(matches); + } + + std::vector FilterPatternSetMembership(const int value) const{ + std::vector matches(this->footer.n_filter_patterns, false); + for(U32 i = 0; i < this->footer.n_filter_patterns; ++i){ + for(U32 j = 0; j < 
this->footer.filter_patterns[i].pattern.size(); ++j){ + if(this->footer.filter_patterns[i].pattern[j] == value){ + matches[i] = true; + break; + } + } + } + return(matches); + } + +private: /**< * * @param stats_basic * @param stats_info * @param stats_format */ - void updateOutputStatistics(import_stats_type& stats_basic, import_stats_type& stats_info, import_stats_type& stats_format); + void UpdateOutputStatistics(import_stats_type& stats_basic, + import_stats_type& stats_info, + import_stats_type& stats_format); /**< * Move over pair of headers from a data container to a block footer * @param offset Destination header in footer * @param container Target container hosting the header */ - inline void __updateHeader(offset_type& offset, const container_type& container){ + inline void UpdateHeader(offset_type& offset, const container_type& container){ const U32 global_key = offset.data_header.global_key; // carry over global key offset = container.header; assert(offset == container.header); // Assert copy is correct @@ -279,7 +381,10 @@ class VariantBlock{ * @param container Target container hosting the header * @param virtual_offset Block virtual offset */ - inline void __updateHeader(offset_type& offset, const container_type& container, const U32& virtual_offset){ + inline void UpdateHeader(offset_type& offset, + const container_type& container, + const U32& virtual_offset) + { const U32 global_key = offset.data_header.global_key; // carry over global key offset = container.header; assert(offset == container.header); // Assert copy is correct @@ -294,11 +399,15 @@ class VariantBlock{ * @param container * @param virtual_offset */ - inline void __writeContainer(std::ostream& stream, offset_type& offset, const container_type& container, const U32 virtual_offset){ + inline void WriteContainer(std::ostream& stream, + offset_type& offset, + const container_type& container, + const U32 virtual_offset) + { if(container.header.data_header.controller.encryption != 
YON_ENCRYPTION_NONE) - return(this->__writeContainerEncrypted(stream, offset, container, virtual_offset)); + return(this->WriteContainerEncrypted(stream, offset, container, virtual_offset)); - this->__updateHeader(offset, container, virtual_offset); + this->UpdateHeader(offset, container, virtual_offset); assert(container.buffer_data.size() == offset.data_header.cLength); stream << container; } @@ -310,48 +419,27 @@ class VariantBlock{ * @param container * @param virtual_offset */ - inline void __writeContainerEncrypted(std::ostream& stream, offset_type& offset, const container_type& container, const U32 virtual_offset){ - this->__updateHeader(offset, container, virtual_offset); + inline void WriteContainerEncrypted(std::ostream& stream, + offset_type& offset, + const container_type& container, + const U32 virtual_offset) + { + this->UpdateHeader(offset, container, virtual_offset); assert(container.buffer_data.size() == offset.data_header.eLength); // Encrypted data is concatenated: write only data buffer stream.write(container.buffer_data.data(), container.buffer_data.size()); } public: + uint16_t n_info_c_allocated; + uint16_t n_format_c_allocated; block_header_type header; block_footer_type footer; - permutation_type ppa_manager; - container_type meta_contig_container; - container_type meta_positions_container; - container_type meta_refalt_container; - container_type meta_controller_container; - container_type meta_quality_container; - container_type meta_names_container; - container_type meta_alleles_container; - container_type meta_info_map_ids; - container_type meta_format_map_ids; - container_type meta_filter_map_ids; - container_type gt_support_data_container; - container_type gt_rle8_container; - container_type gt_rle16_container; - container_type gt_rle32_container; - container_type gt_rle64_container; - container_type gt_simple8_container; - container_type gt_simple16_container; - container_type gt_simple32_container; - container_type 
gt_simple64_container; + container_type* base_containers; container_type* info_containers; container_type* format_containers; + yon_gt_ppa* gt_ppa; - // Use during construction - hash_container_type info_fields; - hash_container_type format_fields; - hash_container_type filter_fields; - hash_vector_container_type info_patterns; - hash_vector_container_type format_patterns; - hash_vector_container_type filter_patterns; - -public: // Utility U64 end_block_; U64 start_compressed_data_; diff --git a/lib/containers/variant_block_container.cpp b/lib/containers/variant_block_container.cpp index f9178b6..b0dac2c 100644 --- a/lib/containers/variant_block_container.cpp +++ b/lib/containers/variant_block_container.cpp @@ -3,195 +3,473 @@ namespace tachyon{ namespace containers{ -const std::vector VariantBlockContainer::get_info_field_pattern_matches(const std::string& field_name) const{ - core::HeaderMapEntry* match = nullptr; - int info_field_global_id = -2; - if(this->header_->getInfoField(field_name, match)){ - info_field_global_id = match->IDX; - } - - std::vector ret; - if(info_field_global_id >= 0){ - // Collect all matches - // Place in array - // 0 = false, 1 = true - S32 local_key = -1; - for(U32 i = 0; i < this->getBlock().footer.n_info_streams; ++i){ - if(this->getBlock().footer.info_offsets[i].data_header.global_key == info_field_global_id){ - local_key = i; +bool VariantBlockContainer::ParseSettings(block_settings_type& settings){ + // Clear previous information (if any). + this->info_id_global_loaded.clear(); + this->info_id_local_loaded.clear(); + this->info_map_global.clear(); + this->format_id_global_loaded.clear(); + this->format_id_local_loaded.clear(); + this->format_map_global.clear(); + + // Todo: if performing genotype annotating then we have to remove + // fields that are annotate by tachyon to prevent duplicates + // (e.g. AC or AF already existing). 
+ std::unordered_map blocked_list; + if(settings.annotate_extra){ + for(U32 i = 0; i < YON_GT_ANNOTATE_FIELDS.size(); ++i){ + const YonInfo* info = this->header_->GetInfo(YON_GT_ANNOTATE_FIELDS[i]); + if(info != nullptr){ + blocked_list[info->idx] = YON_GT_ANNOTATE_FIELDS[i]; + //std::cerr << "Add to blocked list: " << YON_GT_ANNOTATE_FIELDS[i] << "@" << info->idx << std::endl; } } + } - if(local_key == -1){ - //std::cerr << "could not find local" << std::endl; - return ret; + // Parse Info. If all Info containers are loaded then we simply copy + // the order in which they occur. If we are provided with a vector + // of target global identifiers we have to first map these to the + // (possible) local identifiers. + if(settings.load_static & YON_BLK_BV_INFO){ + for(U32 i = 0; i < this->block_.footer.n_info_streams; ++i){ + const std::unordered_map::const_iterator it = blocked_list.find(this->block_.footer.info_offsets[i].data_header.global_key); + if(it == blocked_list.end()){ + //std::cerr << "adding not blocked" << std::endl; + this->info_id_local_loaded.push_back(i); + this->info_id_global_loaded.push_back(this->block_.footer.info_offsets[i].data_header.global_key); + this->info_map_global[this->info_id_global_loaded.back()] = i; // Key on the id just pushed: once a blocked field is skipped the vector is shorter than i, so [i] would read out of bounds. + } else { + //std::cerr << "skipping blocked" << std::endl; + } + } + //std::cerr << this->info_id_local_loaded.size() << "," << this->info_id_global_loaded.size() << std::endl; + } else { + std::vector local_ids; + std::vector global_ids; + for(U32 i = 0; i < settings.info_id_global.size(); ++i){ + // Searches for the global Vcf:INFO idx value in the block. If + // it is found then return that local idx otherwise -1. If the + // idx is found store it in the loaded idx vector. 
+ const std::unordered_map::const_iterator it = blocked_list.find(settings.info_id_global[i]); // Test the requested global id itself: i walks settings.info_id_global, not footer.info_offsets. + if(it == blocked_list.end()){ + const int local = this->block_.GetInfoPosition(settings.info_id_global[i]); + if(local >= 0){ + local_ids.push_back(local); + global_ids.push_back(settings.info_id_global[i]); + } + } } - ret.resize(this->getBlock().footer.n_info_patterns, false); - for(U32 i = 0; i < this->getBlock().footer.n_info_patterns; ++i){ - //std::cerr << i << '\t' << this->getBlock().footer.info_bit_vectors[i][local_key] << std::endl; - ret[i] = this->getBlock().footer.info_bit_vectors[i][local_key]; + if(local_ids.size()){ + // Dedupe vectors. This prevents multiple parsings of the same + // target data container as this is illegal. + for(U32 i = 0; i < local_ids.size(); ++i){ + map_type::const_iterator it = this->info_map_global.find(global_ids[i]); + if(it == this->info_map_global.end()){ + this->info_id_local_loaded.push_back(local_ids[i]); + this->info_id_global_loaded.push_back(global_ids[i]); + this->info_map_global[global_ids[i]] = i; + } + } } } - return(ret); -} -const std::vector VariantBlockContainer::get_format_field_pattern_matches(const std::string& field_name) const{ - core::HeaderMapEntry* match = nullptr; - int format_field_global_id = -2; - if(this->header_->getFormatField(field_name, match)){ - format_field_global_id = match->IDX; - } + // Parse Format. If all Format containers are loaded then we simply copy + // the order in which they occur. If we are provided with a vector + // of target global identifiers we have to first map these to the + // (possible) local identifiers. 
+ if(settings.load_static & YON_BLK_BV_FORMAT){ + for(U32 i = 0; i < this->block_.footer.n_format_streams; ++i){ + this->format_id_local_loaded.push_back(i); + this->format_id_global_loaded.push_back(this->block_.footer.format_offsets[i].data_header.global_key); + this->format_map_global[this->format_id_global_loaded[i]] = i; + } + } else { + std::vector local_ids; + std::vector global_ids; + for(U32 i = 0; i < settings.format_id_global.size(); ++i){ + // Searches for the global Vcf:FORMAT idx value in the block. If + // it is found then return that local idx otherwise -1. If the + // idx is found store it in the loaded idx vector. + const int local = this->block_.GetFormatPosition(settings.format_id_global[i]); + if(local >= 0){ + local_ids.push_back(local); + global_ids.push_back(settings.format_id_global[i]); + } + } - std::vector ret; - if(format_field_global_id >= 0){ - S32 local_key = -1; - for(U32 i = 0; i < this->getBlock().footer.n_format_streams; ++i){ - if(this->getBlock().footer.format_offsets[i].data_header.global_key == format_field_global_id){ - local_key = i; + if(local_ids.size()){ + // Dedupe vectors. This prevents multiple parsings of the same + // target data container as this is illegal. + for(U32 i = 0; i < local_ids.size(); ++i){ + map_type::const_iterator it = this->format_map_global.find(global_ids[i]); + if(it == this->format_map_global.end()){ + this->format_id_local_loaded.push_back(local_ids[i]); + this->format_id_global_loaded.push_back(global_ids[i]); + this->format_map_global[global_ids[i]] = i; + } } } + } - if(local_key == -1){ - //std::cerr << "could not find local" << std::endl; - return ret; + return(this->ParseLoadedPatterns(settings)); +} + +bool VariantBlockContainer::ParseLoadedPatterns(block_settings_type& settings){ + // Clear previous information (if any). 
+ this->info_patterns_local.clear(); + this->format_patterns_local.clear(); + this->info_patterns_local.resize(this->block_.footer.n_info_patterns); + this->format_patterns_local.resize(this->block_.footer.n_format_patterns); + + // Iterate over Info patterns. + if(this->info_id_global_loaded.size()){ + // If all Vcf::INFO fields are desired then return them + // in the stored order to guarantee bit-exactness. Otherwise + // return in the order requested. + if((settings.load_static & YON_BLK_BV_INFO) && settings.annotate_extra == false){ + for(U32 p = 0; p < this->block_.footer.n_info_patterns; ++p){ + this->info_patterns_local[p] = this->block_.footer.info_patterns[p].pattern; + } + } else { // Return in requested order. + for(U32 p = 0; p < this->block_.footer.n_info_patterns; ++p){ + this->info_patterns_local[p] = this->block_.IntersectInfoPatterns(this->info_id_global_loaded, p); + } } + } - // Collect all matches - // Place in array - // 0 = false, 1 = true - ret.resize(this->getBlock().footer.n_format_patterns, false); - for(U32 i = 0; i < this->getBlock().footer.n_format_patterns; ++i){ - //std::cerr << i << '\t' << this->getBlock().index_entry.format_bit_vectors[i][local_format_field_id] << std::endl; - ret[i] = this->getBlock().footer.format_bit_vectors[i][local_key]; + // Iterate over Format patterns. + if(this->format_id_global_loaded.size()){ + if(settings.load_static & YON_BLK_BV_FORMAT){ + // If all Vcf::FORMAT fields are desired then return them + // in the stored order to guarantee bit-exactness. Otherwise + // return in the order requested. 
+ for(U32 p = 0; p < this->block_.footer.n_format_patterns; ++p){ + this->format_patterns_local[p] = this->block_.footer.format_patterns[p].pattern; + } + } else { + for(U32 p = 0; p < this->block_.footer.n_format_patterns; ++p){ + this->format_patterns_local[p] = this->block_.IntersectFormatPatterns(this->format_id_global_loaded, p); + } } } - return(ret); -} -bool VariantBlockContainer::readBlock(std::ifstream& stream, block_settings_type& settings){ - // Get info and format keys - std::vector info_keys, format_keys; - if(settings.info_ID_list.size()) info_keys = this->block_.intersectInfoKeys(settings.info_ID_list); - else info_keys = this->block_.getInfoKeys(); - if(settings.format_ID_list.size()) format_keys = this->block_.intersectFormatKeys(settings.format_ID_list); - else format_keys = this->block_.getFormatKeys(); + return true; +} - if(this->buildMapper(info_keys, format_keys, settings) == false) - return false; +bool VariantBlockContainer::ReadBlock(std::ifstream& stream, block_settings_type& settings){ + // Allocate memory for the Format and Info containers. + // Info containers. + delete [] this->block_.info_containers; + this->block_.info_containers = new VariantBlock::container_type[this->block_.footer.n_info_streams]; + this->block_.n_info_c_allocated = this->block_.footer.n_info_streams; + // Format containers. + delete [] this->block_.format_containers; + this->block_.format_containers = new VariantBlock::container_type[this->block_.footer.n_format_streams]; + this->block_.n_format_c_allocated = this->block_.footer.n_format_streams; - // Todo: ascertain random access order is guaranteed + // Interpret the user-specified block-settings if any. This step converts + // global index offset values into local offsets and computes new pattern + // vectors if required. The ordering of the values are according to the + // input sequence not according to the actual stored order. 
+ this->ParseSettings(settings); - if(settings.ppa.load){ + // Load the FORMAT:GT (GBPBWT) permutation array. + if(settings.load_static & YON_BLK_BV_PPA){ + // If there is FORMAT:GT field data available AND that data has + // been permuted then create a new yon_gt_ppa object to store + // this data. if(this->block_.header.controller.hasGTPermuted && this->block_.header.controller.hasGT){ - this->block_.ppa_manager.header = this->block_.footer.offset_ppa; - stream.seekg(this->block_.start_compressed_data_ + this->block_.footer.offset_ppa.data_header.offset); - stream >> this->block_.ppa_manager; + stream.seekg(this->block_.start_compressed_data_ + this->block_.footer.offsets[YON_BLK_PPA].data_header.offset); + this->block_.LoadContainerSeek(stream, + this->block_.footer.offsets[YON_BLK_PPA], + this->block_.base_containers[YON_BLK_PPA]); + + this->block_.gt_ppa = new yon_gt_ppa; + this->block_.gt_ppa->n_samples = this->header_->GetNumberSamples(); } } - if(settings.contig.load){ - this->block_.__loadContainerSeek(stream, this->block_.footer.offset_meta_contig, this->block_.meta_contig_container); + // Load base meta containers. + for(U32 i = YON_BLK_CONTIG; i < YON_BLK_GT_INT8; ++i){ + if(settings.load_static & (1 << i)){ + this->block_.LoadContainerSeek(stream, + this->block_.footer.offsets[i], + this->block_.base_containers[i]); + } } - if(settings.positions.load){ - this->block_.__loadContainerSeek(stream, this->block_.footer.offset_meta_position, this->block_.meta_positions_container); + // Load genotype containers. At the moment, genotype containers cannot be loaded + // individually by using this wrapper routine. If you wish to load these separately + // you will have to do so manually. 
+ if((settings.load_static & YON_BLK_BV_GT) || (settings.load_static & YON_BLK_BV_FORMAT)){ + this->loaded_genotypes = true; + this->block_.LoadContainerSeek(stream, this->block_.footer.offsets[YON_BLK_GT_INT8], this->block_.base_containers[YON_BLK_GT_INT8]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_INT16], this->block_.base_containers[YON_BLK_GT_INT16]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_INT32], this->block_.base_containers[YON_BLK_GT_INT32]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_INT64], this->block_.base_containers[YON_BLK_GT_INT64]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_S_INT8], this->block_.base_containers[YON_BLK_GT_S_INT8]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_S_INT16], this->block_.base_containers[YON_BLK_GT_S_INT16]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_S_INT32], this->block_.base_containers[YON_BLK_GT_S_INT32]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_S_INT64], this->block_.base_containers[YON_BLK_GT_S_INT64]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_N_INT8], this->block_.base_containers[YON_BLK_GT_N_INT8]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_N_INT16], this->block_.base_containers[YON_BLK_GT_N_INT16]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_N_INT32], this->block_.base_containers[YON_BLK_GT_N_INT32]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_N_INT64], this->block_.base_containers[YON_BLK_GT_N_INT64]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_SUPPORT], this->block_.base_containers[YON_BLK_GT_SUPPORT]); + this->block_.LoadContainer(stream, this->block_.footer.offsets[YON_BLK_GT_PLOIDY], 
this->block_.base_containers[YON_BLK_GT_PLOIDY]); } - if(settings.controller.load){ - this->block_.__loadContainerSeek(stream, this->block_.footer.offset_meta_controllers, this->block_.meta_controller_container); - } + // Load Info containers. Technically there is no difference between the two + // conditions below in terms of outcome. However, the first case guarantees + // that data is loaded linearly from disk as this can be guaranteed when loading + // all available data. There is no such guarntees for the second case. + if(this->block_.footer.n_info_streams && (settings.load_static & YON_BLK_BV_INFO) && settings.annotate_extra == false){ + stream.seekg(this->block_.start_compressed_data_ + this->block_.footer.info_offsets[0].data_header.offset); - if(settings.quality.load){ - this->block_.__loadContainerSeek(stream, this->block_.footer.offset_meta_quality, this->block_.meta_quality_container); + for(U32 i = 0; i < this->block_.footer.n_info_streams; ++i){ + this->block_.LoadContainer(stream, + this->block_.footer.info_offsets[i], + this->block_.info_containers[i]); + } } + // If we have a user-supplied list of identifiers parsed above. + else { + for(U32 i = 0; i < this->info_id_local_loaded.size(); ++i){ + this->block_.LoadContainerSeek(stream, + this->block_.footer.info_offsets[this->info_id_local_loaded[i]], + this->block_.info_containers[this->info_id_local_loaded[i]]); + } - if(settings.names.load){ - this->block_.__loadContainerSeek(stream, this->block_.footer.offset_meta_names, this->block_.meta_names_container); } - if(settings.alleles.load){ - this->block_.__loadContainerSeek(stream, this->block_.footer.offset_meta_refalt, this->block_.meta_refalt_container); - this->block_.__loadContainer(stream, this->block_.footer.offset_meta_alleles, this->block_.meta_alleles_container); + // Load Format containers. Technically there is no difference between the two + // conditions below in terms of outcome. 
However, the first case guarantees + // that data is loaded linearly from disk as this can be guaranteed when loading + // all available data. There is no such guarntees for the second case. + if(this->block_.footer.n_format_streams && (settings.load_static & YON_BLK_BV_FORMAT)){ + stream.seekg(this->block_.start_compressed_data_ + this->block_.footer.format_offsets[0].data_header.offset); + for(U32 i = 0; i < this->block_.footer.n_format_streams; ++i){ + this->block_.LoadContainerSeek(stream, this->block_.footer.format_offsets[i], this->block_.format_containers[i]); + } + // At this point the stream should be located at the end-of-block + // marker as the Format information is stored last. + assert(this->block_.end_compressed_data_ == (U64)stream.tellg()); } - - if(settings.genotypes_rle.load || settings.genotypes_all.load){ - this->block_.__loadContainerSeek(stream, this->block_.footer.offset_gt_8b, this->block_.gt_rle8_container); - this->block_.__loadContainer(stream, this->block_.footer.offset_gt_16b, this->block_.gt_rle16_container); - this->block_.__loadContainer(stream, this->block_.footer.offset_gt_32b, this->block_.gt_rle32_container); - this->block_.__loadContainer(stream, this->block_.footer.offset_gt_64b, this->block_.gt_rle64_container); + // If we have a user-supplied list of identifiers parsed above. 
+ else { + for(U32 i = 0; i < this->format_id_local_loaded.size(); ++i){ + this->block_.LoadContainerSeek(stream, this->block_.footer.format_offsets[this->format_id_local_loaded[i]], this->block_.format_containers[this->format_id_local_loaded[i]]); + } } - if(settings.genotypes_simple.load || settings.genotypes_all.load){ - this->block_.__loadContainerSeek(stream, this->block_.footer.offset_gt_simple8, this->block_.gt_simple8_container); - this->block_.__loadContainer(stream, this->block_.footer.offset_gt_simple16, this->block_.gt_simple16_container); - this->block_.__loadContainer(stream, this->block_.footer.offset_gt_simple32, this->block_.gt_simple32_container); - this->block_.__loadContainer(stream, this->block_.footer.offset_gt_simple64, this->block_.gt_simple64_container); - } + // Seek to end-of-block position. + stream.seekg(this->block_.end_block_); + return(true); +} + +VariantReaderObjects* VariantBlockContainer::LoadObjects(objects_type* objects, block_settings_type& block_settings){ + // Construct a new high-level meta container using all available loaded + // core meta data container. + objects->meta_container = new meta_container_type(this->GetBlock()); - if(settings.genotypes_support.load || settings.genotypes_all.load){ - this->block_.__loadContainerSeek(stream, this->block_.footer.offset_gt_helper, this->block_.gt_support_data_container); + // If the block has genotypes in it and they have been loaded we can construct + // a high-level genotype container and transform the multifarious internal + // encodings into a unified framework. + if(this->HasGenotypes() && (block_settings.load_static & YON_BLK_BV_GT)){ + objects->loaded_genotypes = true; + objects->genotype_container = new gt_container_type(this->GetBlock(), *objects->meta_container); + // Genotype summary object is only allocated when required. 
+ objects->genotype_summary = nullptr; } - if(settings.set_membership.load || settings.genotypes_all.load){ - this->block_.__loadContainerSeek(stream, this->block_.footer.offset_meta_info_id, this->block_.meta_info_map_ids); - this->block_.__loadContainer(stream, this->block_.footer.offset_meta_filter_id, this->block_.meta_filter_map_ids); - this->block_.__loadContainer(stream, this->block_.footer.offset_meta_format_id, this->block_.meta_format_map_ids); + // Format-specific containers. These have to be allocated as double pointers + // to avoid memory collisions because they have different intrinsic class members + // even though they share the same data interface. + objects->n_loaded_format = this->format_id_global_loaded.size(); + objects->format_containers = new format_interface_type*[this->block_.footer.n_format_streams]; + + // Handle Vcf:FORMAT fields. + if(objects->n_loaded_format){ + for(U32 i = 0; i < objects->n_loaded_format; ++i){ + //const U32 global_key = this->block_.footer.format_offsets[i].getGlobalKey(); + const U32 global_key = this->format_id_global_loaded[i]; + const U32 local_key = this->format_id_local_loaded[i]; + objects->format_id_loaded.push_back(local_key); + + // Evaluate the set-membership of a given global key in the available Format patterns + // described in the data container footer. 
+ std::vector matches = this->block_.FormatPatternSetMembership(global_key); + + if(this->header_->format_fields_[global_key].yon_type == YON_VCF_HEADER_INTEGER){ + objects->format_containers[local_key] = new containers::FormatContainer(this->GetBlock().format_containers[local_key], + *objects->meta_container, + matches, + this->header_->GetNumberSamples()); + + } else if(this->header_->format_fields_[global_key].yon_type == YON_VCF_HEADER_STRING || + this->header_->format_fields_[global_key].yon_type == YON_VCF_HEADER_CHARACTER) + { + objects->format_containers[local_key] = new containers::FormatContainer(this->GetBlock().format_containers[local_key], + *objects->meta_container, + matches, + this->header_->GetNumberSamples()); + } else if(this->header_->format_fields_[global_key].yon_type == YON_VCF_HEADER_FLOAT){ + objects->format_containers[local_key] = new containers::FormatContainer(this->GetBlock().format_containers[local_key], + *objects->meta_container, + matches, + this->header_->GetNumberSamples()); + } else if(this->header_->format_fields_[global_key].yon_type == YON_VCF_HEADER_FLAG){ + std::cerr << utility::timestamp("ERROR") << "Format fields cannot have FLAG fields." << std::endl; + exit(1); + } + objects->format_container_map[this->header_->format_fields_[global_key].id] = objects->format_containers[local_key]; + } } - // Load all info - if(settings.info_all.load && this->block_.footer.n_info_streams){ - stream.seekg(this->block_.start_compressed_data_ + this->block_.footer.info_offsets[0].data_header.offset); + // Info-specific containers. These have to be allocated as double pointers + // to avoid memory collisions because they have different intrinsic class members + // even though they share the same data interface. 
+ objects->n_loaded_info = this->info_id_global_loaded.size(); + objects->info_containers = new info_interface_type*[this->block_.footer.n_info_streams]; - this->mapper_.info_container_loaded_.resize(this->block_.footer.n_info_streams); - for(U32 i = 0; i < this->block_.footer.n_info_streams; ++i){ - this->block_.__loadContainer(stream, this->block_.footer.info_offsets[i], this->block_.info_containers[i]); - this->mapper_.info_container_loaded_.at(i)(i, i, this->block_.footer.info_offsets[i].data_header.global_key, &this->block_.footer.info_offsets[i]); - } - } - // If we have supplied a list of identifiers - else if(settings.info_ID_list.size()){ - this->mapper_.info_container_loaded_.resize(info_keys.size()); - // Ascertain that random access is linearly forward - for(U32 i = 0; i < info_keys.size(); ++i){ - stream.seekg(this->block_.start_compressed_data_ + this->block_.footer.info_offsets[this->mapper_.info_container_global_[info_keys[i]].stream_id_local].data_header.offset); - if(!stream.good()){ - std::cerr << utility::timestamp("ERROR","IO") << "Failed to seek for INFO field in block!" << std::endl; - return false; + // Handle Vcf:INFO fields. + if(objects->n_loaded_info){ + for(U32 i = 0; i < objects->n_loaded_info; ++i){ + //const U32 global_key = this->block_.footer.info_offsets[i].getGlobalKey(); + const U32 global_key = this->info_id_global_loaded[i]; + const U32 local_key = this->info_id_local_loaded[i]; + objects->info_id_loaded.push_back(local_key); + + // Evaluate the set-membership of a given global key in the available Info patterns + // described in the data container footer. 
+ std::vector matches = this->block_.InfoPatternSetMembership(global_key); + + if(this->header_->info_fields_[global_key].yon_type == YON_VCF_HEADER_INTEGER){ + objects->info_containers[local_key] = new containers::InfoContainer(this->GetBlock().info_containers[local_key], *objects->meta_container, matches); + } else if(this->header_->info_fields_[global_key].yon_type == YON_VCF_HEADER_STRING || + this->header_->info_fields_[global_key].yon_type == YON_VCF_HEADER_CHARACTER) + { + objects->info_containers[local_key] = new containers::InfoContainer(this->GetBlock().info_containers[local_key], *objects->meta_container, matches); + } else if(this->header_->info_fields_[global_key].yon_type == YON_VCF_HEADER_FLOAT){ + objects->info_containers[local_key] = new containers::InfoContainer(this->GetBlock().info_containers[local_key], *objects->meta_container, matches); + } else { + objects->info_containers[local_key] = new containers::InfoContainer(); } + objects->info_container_map[this->header_->info_fields_[global_key].id] = objects->info_containers[local_key]; + } + } + + return(objects); +} + +yon1_t* VariantBlockContainer::LazyEvaluate(objects_type& objects){ + // Lazy evaluation of interleaved data. + yon1_t* records = new yon1_t[objects.meta_container->size()]; + + // Iterate over the sites described in the meta container. 
+ for(U32 i = 0; i < objects.meta_container->size(); ++i){ + records[i].is_dirty = false; + records[i].is_loaded_meta = true; // todo + records[i].is_loaded_gt = objects.loaded_genotypes; + records[i].id_block = i; + records[i].meta = &objects.meta_container->at(i); - this->block_.__loadContainer(stream, this->block_.footer.info_offsets[this->mapper_.info_container_global_[info_keys[i]].stream_id_local], this->block_.info_containers[i]); - this->mapper_.info_container_loaded_.at(i)(i, this->mapper_.info_container_global_[info_keys[i]].stream_id_local, info_keys[i], &this->block_.footer.info_offsets[this->mapper_.info_container_global_[info_keys[i]].stream_id_local]); + // Check if genotype data has been loaded then checks + // if this variant site has any genotypes set. + if(records[i].is_loaded_gt && records[i].meta->HasGT()){ + records[i].gt_i = &objects.genotype_container->at(i); + + if(this->block_.header.controller.hasGTPermuted) + records[i].gt = records[i].gt_i->GetObjects(*this->block_.gt_ppa); + else + records[i].gt = records[i].gt_i->GetObjects(this->header_->GetNumberSamples()); + + records[i].gt->Evaluate(); + //records[i].gt->Expand(); } - } // end case load_info_ID - // Load all FORMAT data - if(settings.format_all.load && this->block_.footer.n_format_streams){ - stream.seekg(this->block_.start_compressed_data_ + this->block_.footer.format_offsets[0].data_header.offset); - this->mapper_.format_container_loaded_.resize(this->block_.footer.n_format_streams); - for(U32 i = 0; i < this->block_.footer.n_format_streams; ++i){ - this->block_.__loadContainer(stream, this->block_.footer.format_offsets[i], this->block_.format_containers[i]); - this->mapper_.format_container_loaded_.at(i)(i, i, this->block_.footer.format_offsets[i].data_header.global_key, &this->block_.footer.format_offsets[i]); + if(objects.n_loaded_info){ + records[i].n_info = objects.n_loaded_info; + records[i].info_ids = 
&this->info_patterns_local[objects.meta_container->at(i).info_pattern_id]; + records[i].info = new PrimitiveContainerInterface*[this->block_.footer.n_info_streams]; + records[i].info_containers = new InfoContainerInterface*[this->block_.footer.n_info_streams]; + + // Populate Info data. + for(U32 j = 0; j < records[i].info_ids->size(); ++j){ + InfoContainerInterface* info_cnt = objects.info_container_map[this->header_->info_fields_[records[i].info_ids->at(j)].id]; + records[i].info_containers[j] = info_cnt; + records[i].info_hdr.push_back(&this->header_->info_fields_[records[i].info_ids->at(j)]); + + //std::cerr << "INFO " << j << "/" << records[i].info_ids->size() << ": target: " << this->header_->info_fields_[records[i].info_ids->at(j)].id << " container entries = " << info_cnt->size() << std::endl; + + switch(this->header_->info_fields_[records[i].info_ids->at(j)].yon_type){ + case(YON_VCF_HEADER_INTEGER): + records[i].info[j] = &reinterpret_cast*>(info_cnt)->at(i); + break; + case(YON_VCF_HEADER_FLOAT): + records[i].info[j] = &reinterpret_cast*>(info_cnt)->at(i); + break; + case(YON_VCF_HEADER_STRING): + case(YON_VCF_HEADER_CHARACTER): + records[i].info[j] = &reinterpret_cast*>(info_cnt)->at(i); + break; + default: + records[i].info[j] = &reinterpret_cast*>(info_cnt)->at(0); + break; + } + } } - assert(this->block_.end_compressed_data_ == (U64)stream.tellg()); - } // If we have supplied a list of identifiers - else if(settings.format_ID_list.size()){ - this->mapper_.format_container_loaded_.resize(format_keys.size()); - // Ascertain that random access is linearly forward - for(U32 i = 0; i < format_keys.size(); ++i){ - stream.seekg(this->block_.start_compressed_data_ + this->block_.footer.format_offsets[this->mapper_.format_container_global_[format_keys[i]].stream_id_local].data_header.offset); - if(!stream.good()){ - std::cerr << utility::timestamp("ERROR","IO") << "Failed to seek for FORMAT field in block!" 
<< std::endl; - return false; + + if(objects.n_loaded_format){ + records[i].n_format = objects.n_loaded_format; + records[i].format_ids = &this->format_patterns_local[objects.meta_container->at(i).format_pattern_id]; + records[i].fmt = new PrimitiveGroupContainerInterface*[this->block_.footer.n_format_streams]; + records[i].format_containers = new FormatContainerInterface*[this->block_.footer.n_format_streams]; + + // Populate Format data. + for(U32 j = 0; j < records[i].format_ids->size(); ++j){ + FormatContainerInterface* fmt_cnt = objects.format_container_map[this->header_->format_fields_[records[i].format_ids->at(j)].id]; + records[i].format_containers[j] = fmt_cnt; + records[i].format_hdr.push_back(&this->header_->format_fields_[records[i].format_ids->at(j)]); + + //std::cerr << "FORMAT " << j << "/" << records[i].format_ids->size() << ": target: " << records[i].format_hdr[j]->id << " container entries = " << fmt_cnt->size() << std::endl; + + switch(this->header_->format_fields_[records[i].format_ids->at(j)].yon_type){ + case(YON_VCF_HEADER_INTEGER): + records[i].fmt[j] = &reinterpret_cast*>(fmt_cnt)->at(i); + break; + case(YON_VCF_HEADER_FLOAT): + records[i].fmt[j] = &reinterpret_cast*>(fmt_cnt)->at(i); + break; + case(YON_VCF_HEADER_STRING): + case(YON_VCF_HEADER_CHARACTER): + records[i].fmt[j] = &reinterpret_cast*>(fmt_cnt)->at(i); + break; + default: + std::cerr << "fmt at default: illegal primitive in assignment" << std::endl; + records[i].fmt[j] = &reinterpret_cast*>(fmt_cnt)->at(0); + break; + } } + } - this->block_.__loadContainer(stream, this->block_.footer.format_offsets[this->mapper_.format_container_global_[format_keys[i]].stream_id_local], this->block_.format_containers[i]); - this->mapper_.format_container_loaded_.at(i)(i, this->mapper_.format_container_global_[format_keys[i]].stream_id_local, format_keys[i], &this->block_.footer.format_offsets[this->mapper_.format_container_global_[format_keys[i]].stream_id_local]); + // Todo: These should 
interpreted during loading as Info and Format + // fields are. + if(this->block_.footer.n_filter_streams){ + records[i].n_filter = this->block_.footer.n_filter_streams; + + // Populate Filter. + records[i].filter_ids = &this->block_.footer.filter_patterns[objects.meta_container->at(i).filter_pattern_id].pattern; + for(U32 j = 0; j < records[i].filter_ids->size(); ++j){ + records[i].filter_hdr.push_back(&this->header_->filter_fields_[records[i].filter_ids->at(j)]); + } } - } // end case load_info_ID - stream.seekg(this->block_.end_block_); // seek to end-of-block - return(true); + // Pointer to this variant block container. + records[i].parent_container = this; + } + return(records); } } diff --git a/lib/containers/variant_block_container.h b/lib/containers/variant_block_container.h index 892f560..c2c6cc5 100644 --- a/lib/containers/variant_block_container.h +++ b/lib/containers/variant_block_container.h @@ -9,8 +9,8 @@ #include "containers/format_container.h" #include "containers/format_container_string.h" #include "containers/interval_container.h" -#include "containers/hash_container.h" -#include "variant_block_mapper.h" +#include "core/variant_reader_objects.h" +#include "core/variant_record.h" namespace tachyon { namespace containers { @@ -27,52 +27,49 @@ class VariantBlockContainer { typedef VariantBlock block_type; typedef VariantBlockHeader block_header_type; typedef VariantBlockFooter block_footer_type; - typedef VariantBlockMapper block_mapper_type; - typedef core::VariantHeader header_type; + typedef VariantHeader global_header_type; typedef containers::VariantBlock block_entry_type; typedef containers::MetaContainer meta_container_type; typedef containers::GenotypeContainer gt_container_type; typedef containers::InfoContainerInterface info_interface_type; typedef containers::FormatContainerInterface format_interface_type; - typedef containers::GenotypeSummary genotype_summary_type; typedef containers::IntervalContainer interval_container_type; - typedef 
HashContainer hash_container_type; typedef DataBlockSettings block_settings_type; + typedef VariantReaderObjects objects_type; + + typedef std::unordered_map map_type; public: VariantBlockContainer() : - header_(nullptr) + loaded_genotypes(false), + header_(nullptr), + gt_exp(nullptr) { } - VariantBlockContainer(const header_type& header) : - mapper_(header.header_magic.n_format_values, header.header_magic.n_info_values), - header_(&header) + VariantBlockContainer(const global_header_type& header) : + loaded_genotypes(false), + header_(&header), + gt_exp(nullptr) { - + delete [] this->gt_exp; } - ~VariantBlockContainer(void){} + ~VariantBlockContainer(void){ + } - self_type& operator<<(const header_type& header){ + self_type& operator<<(const global_header_type& header){ this->header_ = &header; - this->mapper_ << header; return(*this); } void reset(void){ this->block_.clear(); - // Todo: reset hashes } - bool buildMapper(const std::vector& info_keys, const std::vector& format_keys, const block_settings_type& block_settings){ - if(this->mapper_.build(info_keys, format_keys, this->block_.footer) == false){ - std::cerr << utility::timestamp("ERROR") << "Failed to build mapper..." << std::endl; - return false; - } - return(true); - } + bool ParseSettings(block_settings_type& settings); + bool ParseLoadedPatterns(block_settings_type& settings); /**< @brief Reads one or more separate digital objects from disk * Primary function for reading partial data from disk. Data @@ -81,23 +78,7 @@ class VariantBlockContainer { * @param settings Settings record describing reading parameters * @return Returns FALSE if there was a problem, TRUE otherwise */ - bool readBlock(std::ifstream& stream, block_settings_type& settings); - - /**< - * Calculates which INFO pattern matches are found for the given field - * name in the current loaded block. 
- * @param field_name INFO field name - * @return Returns a vector of booleans representing pattern matches - */ - const std::vector get_info_field_pattern_matches(const std::string& field_name) const; - - /**< - * Calculates which FORMAT pattern matches are found for the given field - * name in the current loaded block. - * @param field_name FORMAT field name - * @return Returns a vector of booleans representing pattern matches - */ - const std::vector get_format_field_pattern_matches(const std::string& field_name) const; + bool ReadBlock(std::ifstream& stream, block_settings_type& settings); /**< * Factory function for FORMAT container given an input `field` name @@ -105,20 +86,7 @@ class VariantBlockContainer { * @return Returns an instance of a `FormatContainer` if successful or a nullpointer otherwise */ template - containers::FormatContainer* get_format_container(const std::string& field_name) const{ - core::HeaderMapEntry* match = nullptr; - int format_field_global_id = -2; - if(this->header_->getFormatField(field_name, match)){ - format_field_global_id = match->IDX; - } else return nullptr; - - if(format_field_global_id >= 0){ - const S32& target_local_id = this->getMapper().getGlobalFormat(format_field_global_id).load_order_index; - if(target_local_id < 0) return nullptr; - return(new containers::FormatContainer(this->getBlock().format_containers[target_local_id], this->header_->getSampleNumber())); - } - else return nullptr; - } + containers::FormatContainer* get_format_container(const std::string& field_name) const; /**< * Factory function for balanced FORMAT container given an input `field` name. 
@@ -129,32 +97,7 @@ class VariantBlockContainer { * @return Returns an instance of a balanced `FormatContainer` if successful or a nullpointer otherwise */ template - containers::FormatContainer* get_balanced_format_container(const std::string& field_name, const containers::MetaContainer& meta_container) const{ - if(meta_container.size() == 0) - return new containers::FormatContainer(); - - core::HeaderMapEntry* match = nullptr; - int format_field_global_id = -2; - if(this->header_->getFormatField(field_name, match)){ - format_field_global_id = match->IDX; - } else return nullptr; - - if(format_field_global_id >= 0){ - const std::vector pattern_matches = this->get_format_field_pattern_matches(field_name); - U32 matches = 0; - for(U32 i = 0; i < pattern_matches.size(); ++i) - matches += pattern_matches[i]; - - if(matches == 0) - return nullptr; - - const S32& target_local_id = this->getMapper().getGlobalFormat(format_field_global_id).load_order_index; - if(target_local_id < 0) return nullptr; - - return(new containers::FormatContainer(this->getBlock().format_containers[target_local_id], meta_container, pattern_matches, this->header_->getSampleNumber())); - } - else return nullptr; - } + containers::FormatContainer* get_balanced_format_container(const std::string& field_name, const containers::MetaContainer& meta_container) const; /**< * Factory function for INFO container given an input `field` name @@ -162,20 +105,7 @@ class VariantBlockContainer { * @return Returns an instance of a `InfoContainer` if successful or a nullpointer otherwise */ template - containers::InfoContainer* get_info_container(const std::string& field_name) const{ - core::HeaderMapEntry* match = nullptr; - int info_field_global_id = -2; - if(this->header_->getInfoField(field_name, match)){ - info_field_global_id = match->IDX; - } else return nullptr; - - if(info_field_global_id >= 0){ - const S32& target_local_id = this->getMapper().getGlobalInfo(info_field_global_id).load_order_index; - 
if(target_local_id < 0) return nullptr; - return(new containers::InfoContainer(this->getBlock().info_containers[target_local_id])); - } - else return nullptr; - } + containers::InfoContainer* get_info_container(const std::string& field_name) const; /**< * Factory function for balanced INFO container given an input `field` name. @@ -186,53 +116,136 @@ class VariantBlockContainer { * @return Returns an instance of a balanced `InfoContainer` if successful or a nullpointer otherwise */ template - containers::InfoContainer* get_balanced_info_container(const std::string& field_name, const containers::MetaContainer& meta_container) const{ - if(meta_container.size() == 0) - return new containers::InfoContainer(); + containers::InfoContainer* get_balanced_info_container(const std::string& field_name, const containers::MetaContainer& meta_container) const; - core::HeaderMapEntry* match = nullptr; - int info_field_global_id = -2; - if(this->header_->getInfoField(field_name, match)){ - info_field_global_id = match->IDX; - } else return nullptr; - - if(info_field_global_id >= 0){ - const std::vector pattern_matches = this->get_info_field_pattern_matches(field_name); + // Accessors + inline block_type& GetBlock(void){ return(this->block_); } + inline const block_type& GetBlock(void) const{ return(this->block_); } - U32 matches = 0; - for(U32 i = 0; i < pattern_matches.size(); ++i) - matches += pattern_matches[i]; + // Checkers + inline bool AnyEncrypted(void) const{ return(this->block_.header.controller.anyEncrypted); } + inline bool HasGenotypes(void) const{ return(this->block_.header.controller.hasGT); } + inline bool HasPermutedGenotypes(void) const{ return(this->block_.header.controller.hasGTPermuted); } - if(matches == 0) - return nullptr; + inline void AllocateGenotypeMemory(void){ this->gt_exp = new yon_gt_rcd*[this->header_->GetNumberSamples()]; } + inline yon_gt_rcd** GetAllocatedGenotypeMemory(void){ return(this->gt_exp); } + inline yon_gt_rcd** 
GetAllocatedGenotypeMemory(void) const{ return(this->gt_exp); } - const S32& target_local_id = this->getMapper().getGlobalInfo(info_field_global_id).load_order_index; - if(target_local_id < 0) return nullptr; + /**< + * Primary construction function for generating the appropriate instances of + * iterators / containers + * @param objects Target objects + * @return Returns reference to input target objects + */ + objects_type* LoadObjects(objects_type* objects, block_settings_type& block_settings); - return(new containers::InfoContainer(this->getBlock().info_containers[target_local_id], meta_container, pattern_matches)); - } - else return nullptr; + objects_type* LoadObjects(block_settings_type& block_settings){ + objects_type* obj = new objects_type(); + return(this->LoadObjects(obj, block_settings)); } - // Accessors - inline block_type& getBlock(void){ return(this->block_); } - inline const block_type& getBlock(void) const{ return(this->block_); } - inline block_mapper_type& getMapper(void){ return(this->mapper_); } - inline const block_mapper_type& getMapper(void) const{ return(this->mapper_); } - - // Checkers - inline const bool anyEncrypted(void) const{ return(this->block_.header.controller.anyEncrypted); } - inline const bool hasGenotypes(void) const{ return(this->block_.header.controller.hasGT); } - inline const bool hasPermutedGenotypes(void) const{ return(this->block_.header.controller.hasGTPermuted); } + yon1_t* LazyEvaluate(objects_type& objects); private: - block_mapper_type mapper_; // global -> local, local -> global, loaded or not, primitive type - block_type block_; - hash_container_type h_tables_format_; - hash_container_type h_tables_info_; - const header_type* header_; + bool loaded_genotypes; + block_type block_; + const global_header_type* header_; + std::vector info_id_local_loaded; + std::vector format_id_local_loaded; + std::vector info_id_global_loaded; + std::vector format_id_global_loaded; + std::vector< std::vector > 
info_patterns_local; + std::vector< std::vector > format_patterns_local; + map_type info_map_global; + map_type format_map_global; + // External memory allocation for linear use of lazy-evaluated + // expansion of genotype records. This is critical when the sample + // numbers are becoming large as allocating/deallocating hundreds + // of thousands of pointers for every variant is very time consuming. + yon_gt_rcd** gt_exp; }; + +// IMPLEMENTATION ------------------------------------------------------------- + + + +template +containers::FormatContainer* VariantBlockContainer::get_format_container(const std::string& field_name) const{ + int format_field_global_id = -2; + YonFormat* fmt = this->header_->GetFormat(field_name); + if(fmt != nullptr){ + format_field_global_id = fmt->idx; + } else return new containers::FormatContainer(); + + DataContainer* container = this->block_.GetFormatContainer(format_field_global_id); + if(container == nullptr) new containers::FormatContainer(); + + return(new containers::FormatContainer(*container, this->header_->GetNumberSamples())); +} + +template +containers::FormatContainer* VariantBlockContainer::get_balanced_format_container(const std::string& field_name, const containers::MetaContainer& meta_container) const{ + if(meta_container.size() == 0) + return new containers::FormatContainer(); + + int format_field_global_id = -2; + YonFormat* fmt = this->header_->GetFormat(field_name); + if(fmt != nullptr){ + format_field_global_id = fmt->idx; + } else return new containers::FormatContainer(); + + DataContainer* container = this->block_.GetFormatContainer(format_field_global_id); + if(container == nullptr) new containers::FormatContainer(); + + const std::vector pattern_matches = this->block_.FormatPatternSetMembership(format_field_global_id); + + U32 matches = 0; + for(U32 i = 0; i < pattern_matches.size(); ++i) + matches += pattern_matches[i]; + + if(matches == 0) return new containers::FormatContainer(); + + return(new 
containers::FormatContainer(*container, meta_container, pattern_matches, this->header_->GetNumberSamples())); + +} + +template +containers::InfoContainer* VariantBlockContainer::get_info_container(const std::string& field_name) const{ + int info_field_global_id = -2; + YonInfo* info = this->header_->GetInfo(field_name); + if(info != nullptr){ + info_field_global_id = info->idx; + } else return new containers::InfoContainer(); + + DataContainer* container = this->block_.GetInfoContainer(info_field_global_id); + if(container == nullptr) new containers::InfoContainer(); + + return(new containers::InfoContainer(*container)); +} + +template +containers::InfoContainer* VariantBlockContainer::get_balanced_info_container(const std::string& field_name, const containers::MetaContainer& meta_container) const{ + int info_field_global_id = -2; + YonInfo* info = this->header_->GetInfo(field_name); + if(info != nullptr){ + info_field_global_id = info->idx; + } else return new containers::InfoContainer(); + + DataContainer* container = this->block_.GetInfoContainer(info_field_global_id); + if(container == nullptr) new containers::InfoContainer(); + + const std::vector pattern_matches = this->block_.InfoPatternSetMembership(info_field_global_id); + + U32 matches = 0; + for(U32 i = 0; i < pattern_matches.size(); ++i) + matches += pattern_matches[i]; + + if(matches == 0) return new containers::InfoContainer(); + + return(new containers::InfoContainer(*container, meta_container, pattern_matches)); +} + } } diff --git a/lib/containers/variant_block_mapper.cpp b/lib/containers/variant_block_mapper.cpp deleted file mode 100644 index a327aaf..0000000 --- a/lib/containers/variant_block_mapper.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include "variant_block_mapper.h" - -namespace tachyon{ -namespace containers{ - -VariantBlockMapper::VariantBlockMapper() : - n_format_fields(0), - n_info_fields(0) -{ -} - -VariantBlockMapper::VariantBlockMapper(const size_t n_format_fields, const size_t 
n_info_fields) : - n_format_fields(n_format_fields), - n_info_fields(n_info_fields) -{ -} - -VariantBlockMapper::~VariantBlockMapper(){} - -bool VariantBlockMapper::build(const block_footer_type& block_footer){ - this->format_container_global_.clear(); - this->format_container_global_.resize(this->n_format_fields); - this->format_container_local_.clear(); - this->format_container_local_.resize(block_footer.n_format_streams); - - for(U32 i = 0; i < block_footer.n_format_streams; ++i){ - // Set global -> local mapping -> loaded mapping - this->format_container_global_[block_footer.format_offsets[i].data_header.global_key](i, i, block_footer.format_offsets[i].data_header.global_key, &block_footer.format_offsets[i]); - // Set local -> global mapping -> loaded mapping - this->format_container_local_[i](i, i, block_footer.format_offsets[i].data_header.global_key, &block_footer.format_offsets[i]); - } - - this->info_container_global_.clear(); - this->info_container_global_.resize(this->n_info_fields); - this->info_container_local_.clear(); - this->info_container_local_.resize(block_footer.n_info_streams); - - for(U32 i = 0; i < block_footer.n_info_streams; ++i){ - // Set global -> local mapping -> loaded mapping - this->info_container_global_[block_footer.info_offsets[i].data_header.global_key](i, i, block_footer.info_offsets[i].data_header.global_key, &block_footer.info_offsets[i]); - // Set local -> global mapping -> loaded mapping - this->info_container_local_[i](i, i, block_footer.info_offsets[i].data_header.global_key, &block_footer.info_offsets[i]); - } - - - return true; -} - -bool VariantBlockMapper::build(const std::vector& info_keys, const std::vector& format_keys, const block_footer_type& block_footer){ - this->format_container_global_.clear(); - this->format_container_global_.resize(this->n_format_fields); - this->format_container_local_.clear(); - this->format_container_local_.resize(block_footer.n_format_streams); - this->format_container_loaded_.clear(); - 
- for(U32 i = 0; i < block_footer.n_format_streams; ++i){ - // Set global -> local mapping -> loaded mapping - this->format_container_global_[block_footer.format_offsets[i].data_header.global_key](-1, i, block_footer.format_offsets[i].data_header.global_key, &block_footer.format_offsets[i]); - // Set local -> global mapping -> loaded mapping - this->format_container_local_[i](-1, i, block_footer.format_offsets[i].data_header.global_key, &block_footer.format_offsets[i]); - } - - for(U32 i = 0; i < format_keys.size(); ++i){ - this->format_container_global_[format_keys[i]].load_order_index = i; - this->format_container_local_[this->format_container_global_[format_keys[i]].stream_id_local].load_order_index = i; - } - - this->info_container_global_.clear(); - this->info_container_global_.resize(this->n_info_fields); - this->info_container_local_.clear(); - this->info_container_local_.resize(block_footer.n_info_streams); - this->info_container_loaded_.clear(); - - for(U32 i = 0; i < block_footer.n_info_streams; ++i){ - // Set global -> local mapping -> loaded mapping - this->info_container_global_[block_footer.info_offsets[i].data_header.global_key](-1, i, block_footer.info_offsets[i].data_header.global_key, &block_footer.info_offsets[i]); - // Set local -> global mapping -> loaded mapping - this->info_container_local_[i](-1, i, block_footer.info_offsets[i].data_header.global_key, &block_footer.info_offsets[i]); - } - - for(U32 i = 0; i < info_keys.size(); ++i){ - this->info_container_global_[info_keys[i]].load_order_index = i; - this->info_container_local_[this->info_container_global_[info_keys[i]].stream_id_local].load_order_index = i; - } - - return true; -} - -} -} diff --git a/lib/containers/variant_block_mapper.h b/lib/containers/variant_block_mapper.h deleted file mode 100644 index 9182fa1..0000000 --- a/lib/containers/variant_block_mapper.h +++ /dev/null @@ -1,138 +0,0 @@ -#ifndef CONTAINERS_VARIANT_BLOCK_MAPPER_H_ -#define CONTAINERS_VARIANT_BLOCK_MAPPER_H_ - 
-#include "components/variant_block_mapper_entry.h" -#include "components/variant_block_footer.h" -#include "core/header/variant_header.h" -#include "core/data_block_settings.h" - -namespace tachyon{ -namespace containers{ - -class VariantBlockMapperContainer { -private: - typedef VariantBlockMapperContainer self_type; - typedef VariantBlockMapperEntry value_type; - typedef std::size_t size_type; - typedef value_type& reference; - typedef const value_type& const_reference; - typedef value_type* pointer; - typedef const value_type* const_pointer; - -public: - VariantBlockMapperContainer(){} - ~VariantBlockMapperContainer() = default; - - // Capacity - inline const bool empty(void) const{ return(this->values_.empty()); } - inline const size_type size(void) const{ return(this->values_.size()); } - - // Element access - inline pointer data(void){ return(&this->values_[0]); } - inline const_pointer data(void) const{ return(&this->values_[0]); } - inline reference operator[](const U32& position){ return(this->values_[position]); } - inline const_reference operator[](const U32& position) const{ return(this->values_[position]); } - inline reference at(const U32& position){ return(this->values_[position]); } - inline const_reference at(const U32& position) const{ return(this->values_[position]); } - - // Operators - inline void operator+=(const value_type& value){ this->values_.push_back(value); } - - // Utility - inline void clear(void){ this->values_.clear(); } - inline void resize(const size_t& new_size){ this->values_.resize(new_size); } - -private: - std::vector values_; -}; - - -// Forward declaration -class VariantBlockContainer; - -/**< - * Mapper class for VariantBlock: allows easy mapping from - * Global -> local - * Local -> global - * Loaded order -> local - * Patterns -> keys - */ -class VariantBlockMapper { - friend VariantBlockContainer; -private: - typedef VariantBlockMapper self_type; - typedef VariantBlockMapperContainer value_type; - typedef std::size_t 
size_type; - typedef value_type& reference; - typedef const value_type& const_reference; - typedef value_type* pointer; - typedef const value_type* const_pointer; - typedef VariantBlockMapperEntry map_type; - typedef VariantBlockFooter block_footer_type; - typedef core::VariantHeader header_type; - typedef DataBlockSettings block_settings_type; - -public: - VariantBlockMapper() ; - VariantBlockMapper(const size_t n_format_fields, const size_t n_info_fields); - ~VariantBlockMapper(); - - self_type& operator<<(const header_type& header){ - this->n_format_fields = header.header_magic.n_format_values; - this->n_info_fields = header.header_magic.n_info_values; - return(*this); - } - - /**< - * Update the current mapper object with the provided block footer - * @param block_footer Target block footer providing information used for mappings - * @return Returns TRUE if successful or FALSE otherwise - */ - bool build(const block_footer_type& block_footer); - - /**< - * Update the current mapper object with the provided block footer, desired info keys - * and format keys - * @param info_keys Provided info keys - * @param format_keys Provided format keys - * @param block_footer Associated variant block footer information - * @return Returns TRUE upon success or FALSE otherwise - */ - bool build(const std::vector& info_keys, const std::vector& format_keys, const block_footer_type& block_footer); - - inline bool isLoadedFormatGlobal(const U32& key) const{ return(this->format_container_global_[key].load_order_index != -1); } - inline bool isLoadedInfoGlobal(const U32& key) const{ return(this->info_container_global_[key].load_order_index != -1); } - inline bool isLoadedFormatLocal(const U32& key) const{ return(this->format_container_local_[key].load_order_index != -1); } - inline bool isLoadedInfoLocal(const U32& key) const{ return(this->info_container_local_[key].load_order_index != -1); } - inline map_type& getGlobalFormat(const U32& key){ 
return(this->format_container_global_[key]); } - inline map_type& getLocalFormat(const U32& key){ return(this->format_container_local_[key]); } - inline map_type& getLoadedFormat(const U32& key){ return(this->format_container_loaded_[key]); } - inline map_type& getGlobalInfo(const U32& key){ return(this->info_container_global_[key]); } - inline map_type& getLocalInfo(const U32& key){ return(this->info_container_local_[key]); } - inline map_type& getLoadedInfo(const U32& key){ return(this->info_container_loaded_[key]); } - inline const map_type& getGlobalFormat(const U32& key) const{ return(this->format_container_global_[key]); } - inline const map_type& getLocalFormat(const U32& key) const{ return(this->format_container_local_[key]); } - inline const map_type& getLoadedFormat(const U32& key) const{ return(this->format_container_loaded_[key]); } - inline const map_type& getGlobalInfo(const U32& key) const{ return(this->info_container_global_[key]); } - inline const map_type& getLocalInfo(const U32& key) const{ return(this->info_container_local_[key]); } - inline const map_type& getLoadedInfo(const U32& key) const{ return(this->info_container_loaded_[key]); } - inline const size_t getNumberFormatLoaded(void) const{ return(this->format_container_loaded_.size()); } - inline const size_t getNumberInfoLoaded(void) const{ return(this->info_container_loaded_.size()); } - -private: - size_type n_format_fields; // Total number of format fields in the file NOT the block - size_type n_info_fields; // Total number of info fields in the file NOT the block - value_type info_container_global_; // Global -> local mapping - value_type info_container_local_; // Local -> global mapping - value_type info_container_loaded_; // Loaded order -> local - value_type format_container_global_;// Global -> local mapping - value_type format_container_local_; // Local -> global mapping - value_type format_container_loaded_;// Loaded order -> local -}; - -} -} - - - -#endif /* 
CONTAINERS_VARIANT_BLOCK_MAPPER_H_ */ diff --git a/lib/containers/vcf_container.cpp b/lib/containers/vcf_container.cpp new file mode 100644 index 0000000..6608b0b --- /dev/null +++ b/lib/containers/vcf_container.cpp @@ -0,0 +1,154 @@ +#include "vcf_container.h" + +namespace tachyon{ +namespace containers{ + +VcfContainer::VcfContainer(void) : + n_carry_over_(0), + n_entries_(0), + n_capacity_(500), + entries_(new pointer[500]) +{ + for(size_type i = 0; i < this->capacity(); ++i) + this->entries_[i] = nullptr; +} + +VcfContainer::VcfContainer(const size_type& start_capacity) : + n_carry_over_(0), + n_entries_(0), + n_capacity_(start_capacity), + entries_(new pointer[start_capacity]) +{ + for(size_type i = 0; i < this->capacity(); ++i) + this->entries_[i] = nullptr; +} + +VcfContainer::~VcfContainer(){ + if(this->entries_ != nullptr){ + for(std::size_t i = 0; i < this->n_entries_; ++i) + bcf_destroy(this->entries_[i]); + + ::operator delete[](static_cast(this->entries_)); + } +} + +void VcfContainer::resize(const size_t new_size){ + if(new_size < this->capacity()){ + for(size_t i = new_size; i < this->n_entries_; ++i) + bcf_destroy(this->entries_[i]); + + if(this->n_entries_ >= new_size) this->n_entries_ = new_size; + return; + } + + pointer* temp = new pointer[new_size]; + for(size_t i = 0; i < new_size; ++i) temp[i] = nullptr; + for(size_t i = 0; i < this->size(); ++i) temp[i] = this->entries_[i]; + + delete [] this->entries_; + this->entries_ = temp; + this->n_capacity_ = new_size; +} + +bool VcfContainer::GetVariants(const int32_t n_variants, + const int64_t n_bases, + std::unique_ptr& reader) +{ + // Make sure enough records are available for + // overloading. If this is not the case we + // resize the container to the target size + // plus some padding. 
+ if(this->size() + n_variants + this->n_carry_over_ >= this->capacity()) + this->resize(this->size() + n_variants + this->n_carry_over_ + 64); + + + // Look at the back of the vector and check if + // that entry is a valid pointer to a htslib + // bcf1_t structure. If it is not then initialize + // a new bcf1_t structure. Next, read a new + // vcf records from the file stream and overload + // the bcf1_t record. + VcfContainer::pointer bcf1_ = this->end(); + if(bcf1_ == nullptr) bcf1_ = bcf_init(); + if(reader->next(bcf1_) == false) + return false; + + // Push the overloaded bcf1_t structure to this + // container in the back. + *this += bcf1_; + + // Track the current position and contig. + int64_t first_pos = bcf1_->pos; + int32_t first_contig = bcf1_->rid; + if(this->size() != 1){ + first_pos = this->entries_[0]->pos; + first_contig = this->entries_[0]->rid; + } + + if(bcf1_->pos - first_pos > n_bases || first_contig != bcf1_->rid){ + this->n_carry_over_ = 1; + return(this->size() - 1); + } + + for(int32_t i = 1; i < n_variants; ++i){ + bcf1_ = this->end(); + if(bcf1_ == nullptr) bcf1_ = bcf_init(); + + if(reader->next(bcf1_) == false) + return(this->size()); + + *this += bcf1_; + + if(bcf1_->pos - first_pos > n_bases || first_contig != bcf1_->rid){ + this->n_carry_over_ = 1; + return(this->size() - 1); + } + } + + return(this->size()); +} + +io::VcfGenotypeSummary VcfContainer::GetGenotypeSummary(const uint32_t position, const uint64_t& n_samples) const{ + io::VcfGenotypeSummary g; + + // If there are no FORMAT fields there cannot exist any + // GT data. + if(this->at(position)->n_fmt == 0) + return(g); + + // Iterate through the allowed primitive types for genotypes to collect summary + // statistics for genotypes at this loci. Information collected includes the + // base ploidy, if there's any mixed phasing, the number of missing genotypes, and + // the number of samples that has a special end-of-vector encoding. 
+ // Only the signed primitives int8_t, int16_t, and int32_t are valid for genotypes. + switch(this->at(position)->d.fmt[0].type){ + case(BCF_BT_INT8): g.evaluate (n_samples, this->at(position)->d.fmt[0]); break; + case(BCF_BT_INT16): g.evaluate(n_samples, this->at(position)->d.fmt[0]); break; + case(BCF_BT_INT32): g.evaluate(n_samples, this->at(position)->d.fmt[0]); break; + case(BCF_BT_NULL): + case(BCF_BT_FLOAT): + case(BCF_BT_CHAR): + default: + std::cerr << "Illegal genotype primtive type: " << io::BCF_TYPE_LOOKUP[this->at(position)->d.fmt[0].type] << std::endl; + } + + return(g); +} + +void VcfContainer::clear(void){ + uint32_t start_pos = 0; + if(this->n_carry_over_){ + assert(this->size() != 0); + std::swap(this->entries_[this->size() - 1], this->entries_[0]); + start_pos = 1; + this->n_carry_over_ = 0; + } + + for(uint32_t i = start_pos; i < this->size(); ++i) + bcf_clear(this->entries_[i]); + + this->n_entries_ = start_pos; +} + +} +} diff --git a/lib/containers/vcf_container.h b/lib/containers/vcf_container.h new file mode 100644 index 0000000..4e4ea53 --- /dev/null +++ b/lib/containers/vcf_container.h @@ -0,0 +1,70 @@ +#ifndef CONTAINERS_VCF_CONTAINER_H_ +#define CONTAINERS_VCF_CONTAINER_H_ + +#include "components/generic_iterator.h" +#include "io/vcf_utils.h" + +namespace tachyon{ +namespace containers{ + +class VcfContainer{ +public: + typedef VcfContainer self_type; + typedef bcf1_t value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + +public: + VcfContainer(void); + VcfContainer(const size_type& start_capacity); + VcfContainer(const VcfContainer& other) = delete; // Disallow copy ctor + ~VcfContainer(); + + inline const size_type& size(void) const{ return(this->n_entries_); } + inline 
size_type sizeWithoutCarryOver(void) const{ return(this->n_entries_ - this->n_carry_over_); } + inline const size_type& capacity(void) const{ return(this->n_capacity_); } + inline pointer front(void){ return(this->entries_[0]); } + inline const_pointer front(void) const{ return(this->entries_[0]); } + inline pointer back(void){ return(this->entries_[this->size() == 0 ? 0 : this->size() - 1 - this->n_carry_over_]); } + inline const_pointer back(void) const{ return(this->entries_[this->size() == 0 ? 0 : this->size() - 1 - this->n_carry_over_]); } + + inline void operator+=(const pointer entry){ this->entries_[this->n_entries_++] = entry; } + inline pointer operator[](const uint32_t position){ return(this->entries_[position]); } + inline const_pointer operator[](const uint32_t position) const{ return(this->entries_[position]); } + inline pointer at(const uint32_t position){ return(this->entries_[position]); } + inline const_pointer at(const uint32_t position) const{ return(this->entries_[position]); } + + inline pointer end(void){ return(this->entries_[this->n_entries_]); } + inline const_pointer end(void) const{ return(this->entries_[this->n_entries_]); } + + void resize(const size_t new_size); + bool GetVariants(const int32_t n_variants, const int64_t n_bases, std::unique_ptr& reader); + + // Calculate genotype summary statistics from a lazy evaluated bcf1_t struct. + // Warning: this function does NOT check if the FORMAT field GT exists either + // in the header or in the structure itself. The assumption is that it does + // exist and according to the Bcf specification has to be the first FORMAT + // field set. 
+ io::VcfGenotypeSummary GetGenotypeSummary(const uint32_t position, const uint64_t& n_samples) const; + void clear(void); + +public: + uint32_t n_carry_over_; + size_type n_entries_; + size_type n_capacity_; + pointer* entries_; +}; + +} +} + + + +#endif /* CONTAINERS_VCF_CONTAINER_H_ */ diff --git a/lib/core/data_block_settings.cpp b/lib/core/data_block_settings.cpp index 275beeb..f79d6b2 100644 --- a/lib/core/data_block_settings.cpp +++ b/lib/core/data_block_settings.cpp @@ -1,133 +1,201 @@ +#include "containers/components/variant_block_footer.h" #include "data_block_settings.h" namespace tachyon{ DataBlockSettings::DataBlockSettings() : show_vcf_header(true), - display_ref(false), - display_alt(false), - display_filter(false), + display_ref(true), + display_alt(true), + display_filter(true), + load_static(std::numeric_limits::max()), + display_static(std::numeric_limits::max()), construct_occ_table(false), - custom_delimiter(false), - custom_output_format(false), - custom_delimiter_char('\t'), - output_json(false), - output_format_vector(false), annotate_extra(false) {} -DataBlockSettings& DataBlockSettings::loadAll(const bool set){ - this->loadAllMeta(true); - this->set_membership(set, set); - this->genotypes_all(set, set); - this->genotypes_rle(set, set); - this->genotypes_simple(set, set); - this->genotypes_other(set, set); - this->genotypes_support(set, set); - this->info_all(set, set); - this->format_all(set, set); - this->ppa(set, set); - this->display_alt = true; - this->display_ref = true; - this->display_filter = true; +DataBlockSettings& DataBlockSettings::LoadWrapper(bool set, const int field_bv){ + this->load_static &= ~(field_bv); + if(set) this->load_static |= field_bv; return(*this); } -DataBlockSettings& DataBlockSettings::loadAllMeta(const bool set){ - this->contig(set, set); - this->positions(set, set); - this->controller(set, set); - this->quality(set, set); - this->names(set, set); - this->alleles(set, set); - this->display_alt = true; - 
this->display_ref = true; - this->display_filter = true; +DataBlockSettings& DataBlockSettings::DisplayWrapper(bool set, const int field_bv){ + this->display_static &= ~(field_bv); + if(set) this->display_static |= field_bv; return(*this); } -DataBlockSettings& DataBlockSettings::loadAllFILTER(const bool set){ - this->set_membership(set, set); - this->display_filter = true; +DataBlockSettings& DataBlockSettings::LoadDisplayWrapper(bool set, const int field_bv){ + this->LoadWrapper(set, field_bv); + this->DisplayWrapper(set, field_bv); return(*this); } -DataBlockSettings& DataBlockSettings::loadAllINFO(const bool set){ - this->info_all(set, set); - this->contig.load = set; - this->positions.load = set; - this->set_membership.load = set; +DataBlockSettings& DataBlockSettings::LoadCore(const bool set){ + for(U32 i = YON_BLK_CONTIG; i <= YON_BLK_ID_FILTER; ++i){ + const U32 bv = 1 << i; + this->LoadWrapper(set, bv); + } + return(*this); +} + +DataBlockSettings& DataBlockSettings::DisplayCore(const bool set){ + for(U32 i = YON_BLK_CONTIG; i <= YON_BLK_ID_FILTER; ++i){ + const U32 bv = 1 << i; + this->DisplayWrapper(set, bv); + } + return(*this); +} + +DataBlockSettings& DataBlockSettings::LoadAll(const bool set){ + if(set){ + this->load_static = std::numeric_limits::max(); + } else { + this->load_static = 0; + } return(*this); } -DataBlockSettings& DataBlockSettings::loadINFO(const std::string& field_name){ +DataBlockSettings& DataBlockSettings::DisplayAll(const bool set){ + if(set){ + this->display_static = std::numeric_limits::max(); + } else { + this->display_static = 0; + } + this->display_alt = set; + this->display_ref = set; + this->display_filter = set; + return(*this); +} + +DataBlockSettings& DataBlockSettings::LoadAllMeta(const bool set){ + for(U32 i = YON_BLK_CONTIG; i <= YON_BLK_ID_FILTER; ++i){ + const U32 bv = 1 << i; + this->LoadWrapper(set, i); + } + return(*this); +} + +DataBlockSettings& DataBlockSettings::DisplayAllMeta(const bool set){ + for(U32 i = 
YON_BLK_CONTIG; i <= YON_BLK_ID_FILTER; ++i){ + const U32 bv = 1 << i; + this->DisplayWrapper(set, i); + } + + this->display_alt = set; + this->display_ref = set; + this->display_filter = set; + return(*this); +} + +DataBlockSettings& DataBlockSettings::LoadAllFilter(const bool set){ + this->LoadWrapper(set, YON_BLK_BV_ID_INFO); + this->LoadWrapper(set, YON_BLK_BV_ID_FORMAT); + this->LoadWrapper(set, YON_BLK_BV_ID_FILTER); + this->LoadWrapper(set, YON_BLK_BV_CONTROLLER); + + return(*this); +} + +DataBlockSettings& DataBlockSettings::DisplayAllFilter(const bool set){ + this->DisplayWrapper(set, YON_BLK_BV_ID_INFO); + this->DisplayWrapper(set, YON_BLK_BV_ID_FORMAT); + this->DisplayWrapper(set, YON_BLK_BV_ID_FILTER); + this->DisplayWrapper(set, YON_BLK_BV_CONTROLLER); + + this->display_filter = true; + return(*this); +} + +DataBlockSettings& DataBlockSettings::LoadAllInfo(const bool set){ + this->LoadWrapper(set, YON_BLK_BV_INFO); // all info + if(set) this->LoadMinimumVcf(true); + + return(*this); +} + +DataBlockSettings& DataBlockSettings::LoadInfo(const std::string& field_name){ if(field_name.size() == 0) return(*this); - this->contig.load = true; - this->positions.load = true; - this->set_membership.load = true; - this->controller(true, false); this->info_list.push_back(field_name); + this->LoadMinimumVcf(true); + return(*this); +} + +DataBlockSettings& DataBlockSettings::LoadInfo(const U32 field_id){ + this->info_id_global.push_back(field_id); + this->LoadMinimumVcf(true); + return(*this); +} + +DataBlockSettings& DataBlockSettings::LoadGenotypes(const bool set){ + this->LoadWrapper(set, YON_BLK_BV_GT); + this->LoadWrapper(set, YON_BLK_BV_PPA); return(*this); } -DataBlockSettings& DataBlockSettings::loadINFO(const U32 field_id){ - this->info_ID_list.push_back(field_id); - this->contig.load = true; - this->positions.load = true; - this->set_membership.load = true; - this->controller.load = true; +DataBlockSettings& DataBlockSettings::DisplayGenotypes(const bool 
set){ + this->DisplayWrapper(set, YON_BLK_BV_GT); + this->DisplayWrapper(set, YON_BLK_BV_PPA); return(*this); } -DataBlockSettings& DataBlockSettings::loadGenotypes(const bool set){ - this->genotypes_all(set, set); - this->genotypes_rle(set, set); - this->genotypes_simple(set, set); - this->genotypes_other(set, set); - this->genotypes_support.load = set; +DataBlockSettings& DataBlockSettings::LoadPermutationArray(const bool set){ + this->LoadWrapper(set, YON_BLK_BV_PPA); return(*this); } -DataBlockSettings& DataBlockSettings::loadPermutationArray(const bool set){ - this->ppa.load = set; +DataBlockSettings& DataBlockSettings::LoadAllFormat(const bool set){ + this->LoadGenotypes(set); + this->LoadWrapper(set, YON_BLK_BV_FORMAT); // all format + this->LoadCore(set); return(*this); } -DataBlockSettings& DataBlockSettings::loadAllFORMAT(const bool set){ - this->ppa(set, set); - this->loadGenotypes(set); - this->format_all.load = set; - this->contig.load = set; - this->positions.load = set; - this->set_membership.load = set; +DataBlockSettings& DataBlockSettings::DisplayAllFormat(const bool set){ + this->DisplayGenotypes(set); + this->DisplayWrapper(set, YON_BLK_BV_FORMAT); // all format return(*this); } -DataBlockSettings& DataBlockSettings::loadFORMAT(const std::string& field_name){ +DataBlockSettings& DataBlockSettings::DisplayAllInfo(const bool set){ + this->DisplayWrapper(set, YON_BLK_BV_INFO); + return(*this); +} + +DataBlockSettings& DataBlockSettings::LoadFormat(const std::string& field_name){ if(field_name.size() == 0) return(*this); - this->contig.load = true; - this->positions.load = true; - this->set_membership.load = true; - if(field_name == "GT") this->loadGenotypes(true); + this->LoadMinimumVcf(true); + if(field_name == "GT") this->LoadGenotypes(true); this->format_list.push_back(field_name); return(*this); } -DataBlockSettings& DataBlockSettings::loadFORMAT(const U32 field_id){ - this->contig.load = true; - this->positions.load = true; - 
this->set_membership.load = true; - this->format_ID_list.push_back(field_id); +DataBlockSettings& DataBlockSettings::LoadFormat(const U32 field_id){ + this->LoadMinimumVcf(true); + this->format_id_global.push_back(field_id); + return(*this); +} + +DataBlockSettings& DataBlockSettings::LoadMinimumVcf(const bool set){ + this->LoadWrapper(set, YON_BLK_BV_CONTIG); + this->LoadWrapper(set, YON_BLK_BV_POSITION); + this->LoadWrapper(set, YON_BLK_BV_CONTROLLER); + this->LoadWrapper(set, YON_BLK_BV_ID_INFO); + this->LoadWrapper(set, YON_BLK_BV_ID_FORMAT); + this->LoadWrapper(set, YON_BLK_BV_ID_FILTER); + this->LoadWrapper(set, YON_BLK_BV_ALLELES); + this->LoadWrapper(set, YON_BLK_BV_REFALT); return(*this); } -DataBlockSettings& DataBlockSettings::setCustomDelimiter(const char delimiter){ - this->custom_delimiter = true; - this->custom_delimiter_char = delimiter; +DataBlockSettings& DataBlockSettings::DisplayMinimumVcf(const bool set){ + this->DisplayWrapper(set, YON_BLK_BV_CONTIG); + this->DisplayWrapper(set, YON_BLK_BV_POSITION); return(*this); } -bool DataBlockSettings::parse(const header_type& header){ +bool DataBlockSettings::Parse(const header_type& header){ std::regex field_identifier_regex("^[A-Za-z_0-9]{1,}$"); for(U32 i = 0; i < this->info_list.size(); ++i){ @@ -135,17 +203,17 @@ bool DataBlockSettings::parse(const header_type& header){ for(U32 j = 0; j < ind.size(); ++j){ ind[j] = utility::remove_excess_whitespace(ind[j]); if(std::regex_match(ind[j], field_identifier_regex)){ - const header_map_type* map = header.getInfoField(ind[j]); - if(map == nullptr){ + const io::VcfInfo* info = header.GetInfo(ind[j]); + if(info == nullptr){ std::cerr << utility::timestamp("ERROR") << "Cannot find INFO field: " << ind[j] << " in string " << this->info_list[i] << std::endl; continue; } - this->loadINFO(ind[j]); + this->LoadInfo(ind[j]); } else { std::cerr << utility::timestamp("ERROR") << "Illegal field name: " << ind[j] << ". Must match \"[A-Za-z_0-9]\"..." 
<< std::endl; return(false); } - this->loadINFO(ind[j]); + this->LoadInfo(ind[j]); } } @@ -156,10 +224,12 @@ bool DataBlockSettings::parse(const header_type& header){ return true; } -bool DataBlockSettings::parseCommandString(const std::vector& command, const header_type& header, const bool customOutputFormat){ - this->custom_output_format = customOutputFormat; // Todo +bool DataBlockSettings::ParseCommandString(const std::vector& command, const header_type& header){ bool allGood = true; + this->display_static = 0; + this->load_static = 0; + std::regex field_identifier_regex("^[A-Za-z_0-9]{1,}$"); for(U32 i = 0; i < command.size(); ++i){ std::vector partitions = utility::split(command[i], ';'); @@ -170,29 +240,25 @@ bool DataBlockSettings::parseCommandString(const std::vector& comma for(U32 j = 0; j < ind.size(); ++j){ ind[j] = utility::remove_excess_whitespace(ind[j]); if(std::regex_match(ind[j], field_identifier_regex)){ - const header_map_type* map = header.getInfoField(ind[j]); - if(map == nullptr){ + const io::VcfInfo* info = header.GetInfo(ind[j]); + if(info == nullptr){ std::cerr << utility::timestamp("ERROR") << "Cannot find INFO field: " << ind[j] << " in string " << partitions[p] << std::endl; allGood = false; continue; } - this->loadINFO(ind[j]); + this->LoadInfo(info->idx); + this->DisplayAllInfo(true); } else { std::cerr << utility::timestamp("ERROR") << "Illegal field name: " << ind[j] << ". Must match \"[A-Za-z_0-9]\"..." 
<< std::endl; allGood = false; } } } else if(strncasecmp(partitions[p].data(), "INFO", 4) == 0 && partitions[p].size() == 4){ - this->loadAllINFO(true); + this->LoadAllInfo(true); + this->DisplayAllInfo(true); } else if(strncasecmp(partitions[p].data(), "FORMAT", 6) == 0 && partitions[p].size() == 6){ - this->format_all(true, true); - this->set_membership(true, true); - this->loadGenotypes(true); - this->set_membership.load = true; - this->positions.load = true; - this->contig.load = true; - this->ppa.load = true; - this->controller.load = true; + this->LoadAllFormat(true); + this->DisplayAllFormat(true); } else if(strncasecmp(partitions[p].data(), "FORMAT=", 7) == 0){ std::vector ind = utility::split(partitions[p].substr(7,command.size()-7), ','); @@ -201,27 +267,42 @@ bool DataBlockSettings::parseCommandString(const std::vector& comma if(std::regex_match(ind[j], field_identifier_regex)){ // Special case for genotypes if(strncasecmp(ind[j].data(), "GT", 2) == 0 && ind[j].size() == 2){ - this->contig.load = true; - this->positions.load = true; - this->controller.load = true; - this->loadGenotypes(true); - this->set_membership.load = true; + this->LoadMinimumVcf(true); + this->LoadGenotypes(true); + + const io::VcfFormat* fmt = header.GetFormat(ind[j]); + if(fmt == nullptr){ + std::cerr << utility::timestamp("ERROR") << "Cannot find FORMAT field: " << ind[j] << " in string " << partitions[p] << std::endl; + allGood = false; + continue; + } + this->LoadFormat(fmt->idx); + this->DisplayAllFormat(true); + } else if(strncasecmp(ind[j].data(), "GENOTYPES", 9) == 0 && ind[j].size() == 9){ - this->contig.load = true; - this->positions.load = true; - this->controller.load = true; - this->loadGenotypes(true); - this->set_membership.load = true; + this->LoadMinimumVcf(true); + this->LoadGenotypes(true); + this->DisplayAllFormat(true); + + const io::VcfFormat* fmt = header.GetFormat(ind[j]); + if(fmt == nullptr){ + std::cerr << utility::timestamp("ERROR") << "Cannot find 
FORMAT field: " << ind[j] << " in string " << partitions[p] << std::endl; + allGood = false; + continue; + } + this->LoadFormat(fmt->idx); + this->DisplayAllFormat(true); } // Any other FORMAT else { - const header_map_type* map = header.getFormatField(ind[j]); - if(map == nullptr){ + const io::VcfFormat* fmt = header.GetFormat(ind[j]); + if(fmt == nullptr){ std::cerr << utility::timestamp("ERROR") << "Cannot find FORMAT field: " << ind[j] << " in string " << partitions[p] << std::endl; allGood = false; continue; } - this->loadFORMAT(ind[j]); + this->LoadFormat(fmt->idx); + this->DisplayAllFormat(true); } } else { std::cerr << utility::timestamp("ERROR") << "Cannot find FORMAT field: " << ind[j] << " in string " << partitions[p] << std::endl; @@ -232,29 +313,43 @@ bool DataBlockSettings::parseCommandString(const std::vector& comma } else if((strncasecmp(partitions[p].data(), "CONTIG", 6) == 0 && partitions[p].length() == 6) || (strncasecmp(partitions[p].data(), "CHROM", 5) == 0 && partitions[p].length() == 5) || (strncasecmp(partitions[p].data(), "CHROMOSOME", 10) == 0 && partitions[p].length() == 10)){ - this->contig(true, true); + this->LoadWrapper(true, YON_BLK_BV_CONTIG); + this->DisplayWrapper(true, YON_BLK_BV_CONTIG); } else if((strncasecmp(partitions[p].data(), "POSITION", 8) == 0 && partitions[p].length() == 8) || (strncasecmp(partitions[p].data(), "POS", 3) == 0 && partitions[p].length() == 3)){ - this->positions(true, true); + this->LoadWrapper(true, YON_BLK_BV_POSITION); } else if((strncasecmp(partitions[p].data(), "REF", 3) == 0 && partitions[p].length() == 3) || (strncasecmp(partitions[p].data(), "REFERENCE", 9) == 0 && partitions[p].length() == 9)){ - this->alleles(true, true); - this->controller(true, true); + this->LoadWrapper(true, YON_BLK_BV_ALLELES); + this->LoadWrapper(true, YON_BLK_BV_REFALT); + this->LoadWrapper(true, YON_BLK_BV_CONTROLLER); + this->DisplayWrapper(true, YON_BLK_BV_ALLELES); + this->DisplayWrapper(true, YON_BLK_BV_REFALT); + 
this->DisplayWrapper(true, YON_BLK_BV_CONTROLLER); this->display_ref = true; } else if((strncasecmp(partitions[p].data(), "ALT", 3) == 0 && partitions[p].length() == 3) || (strncasecmp(partitions[p].data(), "ALTERNATE", 9) == 0 && partitions[p].length() == 9)){ - this->alleles(true, true); - this->controller(true, true); + this->LoadWrapper(true, YON_BLK_BV_ALLELES); + this->LoadWrapper(true, YON_BLK_BV_REFALT); + this->LoadWrapper(true, YON_BLK_BV_CONTROLLER); + this->DisplayWrapper(true, YON_BLK_BV_ALLELES); + this->DisplayWrapper(true, YON_BLK_BV_REFALT); + this->DisplayWrapper(true, YON_BLK_BV_CONTROLLER); this->display_alt = true; } else if((strncasecmp(partitions[p].data(), "QUALITY", 7) == 0 && partitions[p].length() == 7) || (strncasecmp(partitions[p].data(), "QUAL", 4) == 0 && partitions[p].length() == 4)){ - this->quality(true, true); + this->LoadWrapper(true, YON_BLK_BV_QUALITY); + this->DisplayWrapper(true, YON_BLK_BV_QUALITY); } else if((strncasecmp(partitions[p].data(), "NAMES", 5) == 0 && partitions[p].length() == 5) || (strncasecmp(partitions[p].data(), "NAME", 4) == 0 && partitions[p].length() == 4)){ - this->names(true, true); + this->LoadWrapper(true, YON_BLK_BV_NAMES); + this->DisplayWrapper(true, YON_BLK_BV_NAMES); } else if((strncasecmp(partitions[p].data(), "FILTERS", 7) == 0 && partitions[p].length() == 7) || (strncasecmp(partitions[p].data(), "FILTER", 6) == 0 && partitions[p].length() == 6)){ - this->set_membership(true, true); + this->LoadWrapper(true, YON_BLK_BV_CONTROLLER); + this->LoadWrapper(true, YON_BLK_BV_ID_FILTER); + this->DisplayWrapper(true, YON_BLK_BV_CONTROLLER); + this->DisplayWrapper(true, YON_BLK_BV_ID_FILTER); this->display_filter = true; } else { std::cerr << utility::timestamp("ERROR") << "Unknown pattern: " << partitions[p] << std::endl; @@ -267,29 +362,4 @@ bool DataBlockSettings::parseCommandString(const std::vector& comma return true; } - -bool DataBlockSettings::parseSettings(const header_type& header){ - 
this->info_ID_list.clear(); - for(U32 i = 0; i < this->info_list.size(); ++i){ - const core::HeaderMapEntry* map_entry = header.getInfoField(this->info_list[i]); - if(map_entry == nullptr) continue; - const S32 global_key = map_entry->IDX; - if(global_key >= 0){ - this->info_ID_list.push_back(global_key); - } - } - - this->format_ID_list.clear(); - for(U32 i = 0; i < this->format_list.size(); ++i){ - const core::HeaderMapEntry* map_entry = header.getFormatField(this->format_list[i]); - if(map_entry == nullptr) continue; - const S32 global_key = map_entry->IDX; - if(global_key >= 0){ - this->format_ID_list.push_back(global_key); - } - } - - return true; -} - } diff --git a/lib/core/data_block_settings.h b/lib/core/data_block_settings.h index 3fda54e..8d78a7f 100644 --- a/lib/core/data_block_settings.h +++ b/lib/core/data_block_settings.h @@ -8,50 +8,46 @@ namespace tachyon{ -struct DataBlockSettingsPair{ -public: - DataBlockSettingsPair() : load(false), display(false){} - DataBlockSettingsPair(const bool load, const bool display) : load(load), display(display){} - ~DataBlockSettingsPair() = default; - - inline void operator()(const bool& load){ this->load = load; this->display = load; } - inline void operator()(const bool& load, const bool& display){ this->load = load; this->display = display; } - -public: - bool load; - bool display; -}; - /**< * Load and display Settings for the basic variant data block */ struct DataBlockSettings{ public: typedef DataBlockSettings self_type; - typedef DataBlockSettingsPair pair_type; - typedef core::VariantHeader header_type; - typedef core::HeaderMapEntry header_map_type; + typedef VariantHeader header_type; public: DataBlockSettings(); ~DataBlockSettings() = default; - self_type& loadAll(const bool set = true); - self_type& loadAllMeta(const bool set = true); - self_type& loadAllFILTER(const bool set = true); - self_type& loadAllINFO(const bool set = true); - self_type& loadINFO(const std::string& field_name); - self_type& 
loadINFO(const U32 field_id); - self_type& loadGenotypes(const bool set); - self_type& loadPermutationArray(const bool set); - self_type& loadAllFORMAT(const bool set); - self_type& loadFORMAT(const std::string& field_name); - self_type& loadFORMAT(const U32 field_id); - self_type& setCustomDelimiter(const char delimiter); - - bool parse(const header_type& header); - bool parseCommandString(const std::vector& command, const header_type& header, const bool customOutputFormat = false); - bool parseSettings(const header_type& header); + self_type& LoadWrapper(bool set, const int field_bv); + self_type& DisplayWrapper(bool set, const int field_bv); + self_type& LoadDisplayWrapper(bool set, const int field_bv); + self_type& LoadCore(const bool set = true); + self_type& LoadAll(const bool set = true); + self_type& LoadAllMeta(const bool set = true); + self_type& LoadAllFilter(const bool set = true); + self_type& LoadAllInfo(const bool set = true); + self_type& LoadInfo(const std::string& field_name); + self_type& LoadInfo(const U32 field_id); + self_type& LoadGenotypes(const bool set); + self_type& LoadPermutationArray(const bool set); + self_type& LoadAllFormat(const bool set); + self_type& LoadFormat(const std::string& field_name); + self_type& LoadFormat(const U32 field_id); + self_type& LoadMinimumVcf(const bool set = true); + + self_type& DisplayCore(const bool set = true); + self_type& DisplayAll(const bool set = true); + self_type& DisplayAllMeta(const bool set = true); + self_type& DisplayAllFilter(const bool set = true); + self_type& DisplayAllInfo(const bool set = true); + self_type& DisplayGenotypes(const bool set); + self_type& DisplayAllFormat(const bool set); + self_type& DisplayMinimumVcf(const bool set = true); + + bool Parse(const header_type& header); + bool ParseCommandString(const std::vector& command, const header_type& header); public: bool show_vcf_header; @@ -62,29 +58,10 @@ struct DataBlockSettings{ bool display_filter; // Load/display pairs - 
pair_type contig; - pair_type positions; - pair_type controller; - pair_type quality; - pair_type names; - pair_type alleles; - pair_type set_membership; - pair_type genotypes_all; - pair_type genotypes_rle; - pair_type genotypes_simple; - pair_type genotypes_other; - pair_type genotypes_support; - pair_type ppa; - pair_type info_all; - pair_type format_all; + U32 load_static; + U32 display_static; bool construct_occ_table; - bool custom_delimiter; - bool custom_output_format; - char custom_delimiter_char; - - bool output_json; - bool output_format_vector; bool annotate_extra; @@ -93,8 +70,8 @@ struct DataBlockSettings{ std::vector info_list; std::vector format_list; - std::vector info_ID_list; - std::vector format_ID_list; + std::vector info_id_global; + std::vector format_id_global; // blocks to load std::vector blocks_numbers; diff --git a/lib/core/footer/footer.h b/lib/core/footer/footer.h index 180706b..a0f482d 100644 --- a/lib/core/footer/footer.h +++ b/lib/core/footer/footer.h @@ -1,13 +1,15 @@ #ifndef CORE_FOOTER_FOOTER_H_ #define CORE_FOOTER_FOOTER_H_ + +#include "support/type_definitions.h" #include "support/magic_constants.h" #include "support/helpers.h" namespace tachyon{ namespace core{ -#define YON_FOOTER_LENGTH (constants::TACHYON_FILE_EOF_LENGTH + sizeof(U64)*3 + sizeof(U16)) +#define YON_FOOTER_LENGTH ((constants::TACHYON_FILE_EOF_LENGTH) + sizeof(U64)*3 + sizeof(U16)) struct Footer{ public: @@ -52,7 +54,7 @@ struct Footer{ inline const U16& getController(void) const{ return(this->controller); } inline U16& getController(void){ return(this->controller); } - inline const bool validate(void) const{ + inline bool validate(void) const{ if(this->offset_end_of_data == 0) return false; if(this->n_blocks == 0) return false; if(this->n_variants == 0) return false; diff --git a/lib/core/genotype_object.h b/lib/core/genotype_object.h deleted file mode 100644 index 921a05b..0000000 --- a/lib/core/genotype_object.h +++ /dev/null @@ -1,357 +0,0 @@ -#ifndef 
CORE_GENOTYPE_OBJECT_H_ -#define CORE_GENOTYPE_OBJECT_H_ - -#include "meta_entry.h" - -namespace tachyon{ -namespace core{ - -#define YON_GT_RLE_ALLELE_A(PRIMITITVE, SHIFT, ADD) (((PRIMITITVE) & ((1 << (SHIFT)) - 1) << (ADD)) >> (ADD)); -#define YON_GT_RLE_ALLELE_B(PRIMITIVE, SHIFT, ADD) (((PRIMITIVE) & ((1 << (SHIFT)) - 1) << ((ADD)+(SHIFT))) >> ((ADD)+(SHIFT))); -#define YON_GT_RLE_LENGTH(PRIMITIVE, SHIFT, ADD) ((PRIMITIVE) >> (2*(SHIFT) + (ADD))) -#define YON_GT_DIPLOID_ALLELE_LOOKUP(A,B,shift,mask) (((A) & (mask)) << (shift)) | ((B) & (mask)) -#define YON_GT_DIPLOID_BCF_A(PRIMITIVE, SHIFT) (((PRIMITIVE) >> ((SHIFT) + 1)) & (((U64)1 << (SHIFT)) - 1)) -#define YON_GT_DIPLOID_BCF_B(PRIMITIVE, SHIFT) (((PRIMITIVE) >> 1) & (((U64)1 << (SHIFT)) - 1)) -#define YON_GT_DIPLOID_BCF_PHASE(PRIMITIVE) ((PRIMITIVE) & 1) - -const SBYTE YON_GT_RLE_CORRECTION[3] = {0, 0, 4}; -const SBYTE YON_GT_RLE_RECODE[3] = {0,1,-2}; - -struct GTObjectAllele{ -public: - GTObjectAllele() : phase(0), allele(0){} - ~GTObjectAllele() = default; - -public: - BYTE phase; - SBYTE allele; -}; - -struct GTObject{ -private: - typedef GTObject self_type; - typedef GTObjectAllele allele_type; - typedef allele_type value_type; - typedef value_type& reference; - typedef const value_type& const_reference; - typedef value_type* pointer; - typedef const value_type* const_pointer; - typedef std::size_t size_type; - -public: - GTObject(void) : - n_ploidy(0), - n_objects(0), - alleles(nullptr) - {} - - GTObject(const self_type& other) : - n_ploidy(other.n_ploidy), - n_objects(other.n_objects), - alleles(new value_type[other.n_objects]) - { - for(U32 i = 0; i < this->n_ploidy; ++i) - this->alleles[i] = other.alleles[i]; - } - - GTObject(self_type&& other) noexcept : - n_ploidy(other.n_ploidy), - n_objects(other.n_objects), - alleles(other.alleles) - { - other.alleles = nullptr; - } - - GTObject& operator=(const self_type& other){ - if(this->n_ploidy == other.n_ploidy){ - for(U32 i = 0; i < this->n_ploidy; ++i) - 
this->alleles[i] = other.alleles[i]; - } else { - delete [] this->alleles; - this->alleles = new value_type[other.n_ploidy]; - - for(U32 i = 0; i < other.n_ploidy; ++i) - this->alleles[i] = other.alleles[i]; - } - - this->n_ploidy = other.n_ploidy; - this-> n_objects = other.n_objects; - return(*this); - } - - GTObject& operator=(self_type&& other) noexcept{ - if (this == &other) - return *this; - - this->n_ploidy = other.n_ploidy; - this->n_objects = other.n_objects; - delete [] this->alleles; - this->alleles = other.alleles; - other.alleles = nullptr; - return *this; - } - - virtual ~GTObject(void){ delete [] this->alleles; } - - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - // Capacity - inline const size_t size(void) const{ return(this->n_objects); } - - // Element access - inline pointer data(void){ return(this->alleles); } - inline const_pointer data(void) const{ return(this->alleles); } - inline reference operator[](const U32& position){ return(this->alleles[position]); 
} - inline const_reference operator[](const U32& position) const{ return(this->alleles[position]); } - inline reference at(const U32& position){ return(this->alleles[position]); } - inline const_reference at(const U32& position) const{ return(this->alleles[position]); } - - // Iterator - inline iterator begin(){ return iterator(&this->alleles[0]); } - inline iterator end(){ return iterator(&this->alleles[this->n_objects]); } - inline const_iterator begin() const{ return const_iterator(&this->alleles[0]); } - inline const_iterator end() const{ return const_iterator(&this->alleles[this->n_objects]); } - inline const_iterator cbegin() const{ return const_iterator(&this->alleles[0]); } - inline const_iterator cend() const{ return const_iterator(&this->alleles[this->n_objects]); } - - friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ - if(entry.n_ploidy){ - if(entry.alleles[0].allele == -1){ - stream.put('.'); - return(stream); - } - if(entry.alleles[0].allele == -2) stream.put('.'); - else stream << (int)entry.alleles[0].allele; - - for(U32 i = 1; i < entry.n_ploidy; ++i){ - if(entry.alleles[i].allele == -1) break; - stream << (entry.alleles[i].phase ? '|' : '/'); - if(entry.alleles[i].allele == -2) stream.put('.'); - else stream << (int)entry.alleles[i].allele; - } - } - return(stream); - } - - friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const self_type& entry){ - if(entry.n_ploidy){ - if(entry.alleles[0].allele == -1){ - buffer += '.'; - return(buffer); - } - if(entry.alleles[0].allele == -2) buffer += '.'; - else buffer.AddReadble(entry.alleles[0].allele); - - for(U32 i = 1; i < entry.n_ploidy; ++i){ - if(entry.alleles[i].allele == -1) break; - buffer += (entry.alleles[i].phase ? 
'|' : '/'); - if(entry.alleles[i].allele == -2) buffer += '.'; - else buffer.AddReadble(entry.alleles[i].allele); - } - } - return(buffer); - } - -public: - BYTE n_ploidy; // ploidy of objects - size_type n_objects;// number of objects - pointer alleles; // alleleic data -> length = n_ploidy -}; - -struct GTObjectDiploidRLE : public GTObject{ -private: - typedef GTObjectDiploidRLE self_type; - typedef core::MetaEntry meta_type; - -public: - GTObjectDiploidRLE(void){} - - template - void operator()(const T& gt_primitive, const meta_type& meta_entry) - { - this->__interpret(gt_primitive, meta_entry); - } - - void operator()(const U32& n_entries, const S32& alleleA, const S32& alleleB, const bool& phase) - { - this->__interpret(n_entries, alleleA, alleleB, phase); - } - - void __interpret(const U32& n_entries, const S32& alleleA, const S32& alleleB, const bool& phase) - { - if(this->alleles == nullptr || this->n_ploidy != 2){ - delete [] this->alleles; - this->n_ploidy = 2; - this->alleles = new core::GTObjectAllele[2]; - } - - this->n_objects = n_entries; - this->alleles[0].allele = alleleA; - this->alleles[1].allele = alleleB; - this->alleles[0].phase = phase; - } - -private: - template - void __interpret(const T& gt_primitive, const meta_type& meta_entry) - { - if(this->alleles == nullptr || this->n_ploidy != 2){ - delete [] this->alleles; - this->n_ploidy = 2; - this->alleles = new core::GTObjectAllele[2]; - } - const BYTE shift = meta_entry.isAnyGTMissing() ? 2 : 1; - const BYTE add = meta_entry.isGTMixedPhasing() ? 
1 : 0; - - if(add) this->alleles[0].phase = gt_primitive & 1; - else this->alleles[0].phase = meta_entry.getControllerPhase(); - - this->n_objects = YON_GT_RLE_LENGTH(gt_primitive, shift, add); - this->alleles[0].allele = YON_GT_RLE_ALLELE_A(gt_primitive, shift, add); - this->alleles[1].allele = YON_GT_RLE_ALLELE_B(gt_primitive, shift, add); - } -}; - -struct GTObjectDiploidSimple : public GTObject{ -private: - typedef GTObjectDiploidSimple self_type; - typedef core::MetaEntry meta_type; - -public: - GTObjectDiploidSimple(void){} - - template - void operator()(const T& gt_primitive, const meta_type& meta_entry) - { - this->__interpret(gt_primitive, meta_entry); - } - - void operator()(const U32& n_entries, const S32& alleleA, const S32& alleleB, const bool& phase) - { - this->__interpret(n_entries, alleleA, alleleB, phase); - } - - void __interpret(const U32& n_entries, const S32& alleleA, const S32& alleleB, const bool& phase) - { - if(this->alleles == nullptr || this->n_ploidy != 2){ - delete [] this->alleles; - this->n_ploidy = 2; - this->alleles = new core::GTObjectAllele[2]; - } - - this->n_objects = n_entries; - this->alleles[0].allele = alleleA; - this->alleles[1].allele = alleleB; - this->alleles[0].phase = phase; - } - -private: - template - void __interpret(const T& gt_primitive, const meta_type& meta_entry) - { - if(this->alleles == nullptr || this->n_ploidy != 2){ - delete [] this->alleles; - this->n_ploidy = 2; - this->alleles = new core::GTObjectAllele[2]; - } - const BYTE shift = ceil(log2(meta_entry.getNumberAlleles() + 1 + meta_entry.isAnyGTMissing())); // Bits occupied per allele, 1 value for missing - const BYTE add = meta_entry.isGTMixedPhasing() ? 
1 : 0; - - if(add) this->alleles[0].phase = gt_primitive & 1; - else this->alleles[0].phase = meta_entry.getControllerPhase(); - - this->n_objects = YON_GT_RLE_LENGTH(gt_primitive, shift, add); - this->alleles[0].allele = YON_GT_RLE_ALLELE_A(gt_primitive, shift, add); - this->alleles[1].allele = YON_GT_RLE_ALLELE_B(gt_primitive, shift, add); - } -}; - -struct GTObjectDiploidBCF : public GTObject{ -private: - typedef GTObjectDiploidBCF self_type; - typedef core::MetaEntry meta_type; - -public: - GTObjectDiploidBCF(void){} - - template - void operator()(const T& gt_primitive, const meta_type& meta_entry) - { - this->__interpret(gt_primitive, meta_entry); - } - - void operator()(const U32& n_entries, const S32& alleleA, const S32& alleleB, const bool& phase) - { - this->__interpret(n_entries, alleleA, alleleB, phase); - } - - void __interpret(const U32& n_entries, const S32& alleleA, const S32& alleleB, const bool& phase) - { - if(this->alleles == nullptr || this->n_ploidy != 2){ - delete [] this->alleles; - this->n_ploidy = 2; - this->alleles = new core::GTObjectAllele[2]; - } - - this->n_objects = n_entries; - this->alleles[0].allele = alleleA; - this->alleles[1].allele = alleleB; - this->alleles[0].phase = phase; - } - -private: - template - void __interpret(const T& gt_primitive, const meta_type& meta_entry) - { - if(this->alleles == nullptr || this->n_ploidy != 2){ - delete [] this->alleles; - this->n_ploidy = 2; - this->alleles = new core::GTObjectAllele[2]; - } - const BYTE shift = (sizeof(T)*8 - 1) / 2; - - this->alleles[0].phase = YON_GT_DIPLOID_BCF_PHASE(gt_primitive); - this->n_objects = 1; - this->alleles[0].allele = YON_GT_DIPLOID_BCF_A(gt_primitive, shift); - this->alleles[1].allele = YON_GT_DIPLOID_BCF_B(gt_primitive, shift); - } -}; - -} -} - - - -#endif /* GTOBJECT_H_ */ diff --git a/lib/core/genotype_summary.h b/lib/core/genotype_summary.h deleted file mode 100644 index 999c0de..0000000 --- a/lib/core/genotype_summary.h +++ /dev/null @@ -1,345 +0,0 
@@ -#ifndef ENCODERGENOTYPESRLEHELPER_H_ -#define ENCODERGENOTYPESRLEHELPER_H_ - -#include "genotype_object.h" - -namespace tachyon{ -namespace containers{ - -// Forward declare -class GenotypeContainerInterface; -template class GenotypeContainerDiploidRLE; -template class GenotypeContainerDiploidSimple; -template class GenotypeContainerDiploidBCF; - -// Remaps -const BYTE TACHYON_GT_SUMMARY_REMAP[4] = {2, 3, 1, 1}; // 0 = EOV does not exist in this encoding - -struct GenotypeSummaryObject{ - GenotypeSummaryObject() : counts_(0), countsA_(0), countsB_(0){} - ~GenotypeSummaryObject(){} - - void reset(void){ - this->counts_ = 0; - this->countsA_ = 0; - this->countsB_ = 0; - } - - void operator+=(const U64& value){ this->counts_ += value; } - - U64 counts_; - U64 countsA_; - U64 countsB_; -}; - -struct GenotypeSummary{ - typedef U64 value_type; - typedef core::MetaEntry meta_type; - -public: - GenotypeSummary() : - n_alleles_(10), - matrix_(new value_type*[this->n_alleles_]), - vectorA_(new value_type[this->n_alleles_]), - vectorB_(new value_type[this->n_alleles_]) - { - for(U32 i = 0; i < this->n_alleles_; ++i){ - this->matrix_[i] = new value_type[this->n_alleles_]; - memset(this->matrix_[i], 0, sizeof(value_type)*this->n_alleles_); - } - memset(this->vectorA_, 0, sizeof(value_type)*this->n_alleles_); - memset(this->vectorB_, 0, sizeof(value_type)*this->n_alleles_); - } - - GenotypeSummary(const BYTE n_alleles) : - n_alleles_(n_alleles + 2), - matrix_(new value_type*[this->n_alleles_]), - vectorA_(new value_type[this->n_alleles_]), - vectorB_(new value_type[this->n_alleles_]) - { - for(U32 i = 0; i < this->n_alleles_; ++i){ - this->matrix_[i] = new value_type[this->n_alleles_]; - memset(this->matrix_[i], 0, sizeof(value_type)*this->n_alleles_); - } - memset(this->vectorA_, 0, sizeof(value_type)*this->n_alleles_); - memset(this->vectorB_, 0, sizeof(value_type)*this->n_alleles_); - } - - ~GenotypeSummary(){ - for(U32 i = 0; i < this->n_alleles_; ++i) - delete [] 
this->matrix_[i]; - delete [] this->matrix_; - delete [] this->vectorA_; - delete [] this->vectorB_; - } - - void clear(void){ - for(U32 i = 0; i < this->n_alleles_; ++i){ - memset(this->matrix_[i], 0, sizeof(value_type)*this->n_alleles_); - } - memset(this->vectorA_, 0, sizeof(value_type)*this->n_alleles_); - memset(this->vectorB_, 0, sizeof(value_type)*this->n_alleles_); - } - - void printDiploid(std::ostream& stream){ - stream << this->matrix_[0][0]; - for(U32 j = 1; j < this->n_alleles_; ++j){ - if(this->matrix_[0][j]) stream << '\t' << this->matrix_[0][j]; - } - - for(U32 i = 1; i < this->n_alleles_; ++i){ - for(U32 j = 0; j < this->n_alleles_; ++j){ - if(this->matrix_[i][j]) stream << '\t' << this->matrix_[i][j]; - } - } - } - - U64 alleleCount(void) const{ - U64 total = 0; - for(U32 i = 2; i < this->n_alleles_; ++i){ - total += this->vectorA_[i]; - total += this->vectorB_[i]; - } - return(total); - } - - U64 alleleCountA(void) const{ - U64 total = 0; - for(U32 i = 2; i < this->n_alleles_; ++i){ - total += this->vectorA_[i]; - } - return(total); - } - - U64 alleleCountB(void) const{ - U64 total = 0; - for(U32 i = 2; i < this->n_alleles_; ++i){ - total += this->vectorB_[i]; - } - return(total); - } - - U64 genotypeCount(void) const{ - U64 total = 0; - for(U32 i = 2; i < this->n_alleles_; ++i){ - for(U32 j = 2; j < this->n_alleles_; ++j){ - total += this->matrix_[i][j]; - } - } - return(total); - } - - /**< - // This code implements an exact SNP test of Hardy-Weinberg Equilibrium as described in - // Wigginton, JE, Cutler, DJ, and Abecasis, GR (2005) A Note on Exact Tests of - // Hardy-Weinberg Equilibrium. American Journal of Human Genetics. 76: 000 - 000 - // - // Written by Jan Wigginton - // Modified to use Tachyon data by Marcus D. R. 
Klarqvist (https;//github.com/mklarqvist/tachyon) - */ - std::vector calculateHardyWeinberg(const meta_type& meta) const{ - if(meta.n_alleles == 1) return std::vector(); - const BYTE n_limit = meta.n_alleles + 2 > this->n_alleles_ ? this->n_alleles_ - 1 : meta.n_alleles - 1; - std::vector results(n_limit, 1); - - for(U32 i = 0; i < n_limit; ++i) - results[i] = this->__calculateHardyWeinberg(2, 3+i); - - return(results); - } - - std::vector calculateAlleleFrequency(const meta_type& meta) const{ - const BYTE n_limit = meta.n_alleles + 2 > this->n_alleles_ - ? this->n_alleles_ - 2 - : meta.n_alleles; - std::vector results(n_limit, 0); - - U64 n_total = 0; - for(U32 i = 0; i < n_limit; ++i){ - results[i] = this->vectorA_[2+i] + this->vectorB_[2+i]; - n_total += results[i]; - } - for(U32 i = 0; i < n_limit; ++i) results[i] /= n_total; - - return(results); - } - - std::vector calculateInbreedingCoefficient(const meta_type& meta) const{ - // Allele frequency of A - const double p = ((double)2*this->matrix_[2][2] + this->matrix_[2][3] + this->matrix_[3][2]) / (2*this->genotypeCount()); - // Genotype frequency of heterozygotes - const double pg = ((double)this->matrix_[2][3] + this->matrix_[3][2]) / this->genotypeCount(); - // Expected heterozygosity - const double exp = 2*p*(1-p); - // Population inbreeding coefficient: F - const double f_pic = exp > 0 ? (exp-pg)/exp : 0; - - return(std::vector()); - } - - template - inline void operator+=(const GenotypeContainerDiploidRLE& gt_rle_container){ - const BYTE shift = gt_rle_container.getMeta().isAnyGTMissing() ? 2 : 1; - const BYTE add = gt_rle_container.getMeta().isGTMixedPhasing() ? 
1 : 0; - - for(U32 i = 0; i < gt_rle_container.size(); ++i){ - const U64 length = YON_GT_RLE_LENGTH(gt_rle_container.at(i), shift, add); - const BYTE alleleA = YON_GT_RLE_ALLELE_A(gt_rle_container.at(i), shift, add); - const BYTE alleleB = YON_GT_RLE_ALLELE_B(gt_rle_container.at(i), shift, add); - - this->matrix_[TACHYON_GT_SUMMARY_REMAP[alleleA]][TACHYON_GT_SUMMARY_REMAP[alleleB]] += length; - this->vectorA_[TACHYON_GT_SUMMARY_REMAP[alleleA]] += length; - this->vectorB_[TACHYON_GT_SUMMARY_REMAP[alleleB]] += length; - } - } - - template - inline void operator+=(const GenotypeContainerDiploidSimple& gt_simple_container){ - const BYTE shift = ceil(log2(gt_simple_container.getMeta().getNumberAlleles() + 2 + 1)); // Bits occupied per allele, 1 value for missing - const BYTE add = gt_simple_container.getMeta().isGTMixedPhasing() ? 1 : 0; - //const BYTE matrix_add = !gt_simple_container.getMeta().isMixedPloidy(); - - if(gt_simple_container.getMeta().n_alleles + 2 > this->n_alleles_){ - std::cerr << "too many alleles: " << gt_simple_container.getMeta().n_alleles + 2 << "/" << (int)this->n_alleles_ << std::endl; - return; - } - - for(U32 i = 0; i < gt_simple_container.size(); ++i){ - const U64 length = YON_GT_RLE_LENGTH(gt_simple_container.at(i), shift, add); - const BYTE alleleA = YON_GT_RLE_ALLELE_A(gt_simple_container.at(i), shift, add); - const BYTE alleleB = YON_GT_RLE_ALLELE_B(gt_simple_container.at(i), shift, add); - this->matrix_[alleleA][alleleB] += length; - this->vectorA_[alleleA] += length; - this->vectorB_[alleleB] += length; - } - } - - template - inline void operator+=(const GenotypeContainerDiploidBCF& gt_diploid_bcf_container){ - const BYTE shift = (sizeof(T)*8 - 1) / 2; - - for(U32 i = 0; i < gt_diploid_bcf_container.size(); ++i){ - const U16 alleleA = YON_GT_DIPLOID_BCF_A(gt_diploid_bcf_container.at(i), shift); - const U16 alleleB = YON_GT_DIPLOID_BCF_B(gt_diploid_bcf_container.at(i), shift); - - this->matrix_[alleleA][alleleB] += 1; - 
this->vectorA_[alleleA] += 1; - this->vectorB_[alleleB] += 1; - } - } - - inline void operator+=(const bcf::BCFEntry& entry){ - if(entry.hasGenotypes == false) return; - - if(entry.body->n_allele + 2 > this->n_alleles_){ - std::cerr << "too many alleles: " << entry.body->n_allele + 2 << "/" << (int)this->n_alleles_ << std::endl; - return; - } - - U32 internal_pos = entry.formatID[0].l_offset; - U32 k = 0; - for(U32 i = 0; i < 2*entry.body->n_sample; i += 2, ++k){ - const BYTE& fmt_type_value1 = *reinterpret_cast(&entry.data[internal_pos++]); - const BYTE& fmt_type_value2 = *reinterpret_cast(&entry.data[internal_pos++]); - BYTE alleleA = fmt_type_value2 >> 1; - BYTE alleleB = fmt_type_value1 >> 1; - alleleA += (alleleA != 0 ? 1 : 0); - alleleB += (alleleB != 0 ? 1 : 0); - //std::cerr << (int)alleleA << "," << (int)alleleB << std::endl; - if(!(alleleA < this->n_alleles_ && alleleB < this->n_alleles_)){ - std::cerr << entry.body->n_allele << std::endl; - std::cerr << internal_pos << "/" << entry.l_data << std::endl; - std::cerr << "pos: " << i << "/" << 2*entry.body->n_sample << "@" << k << std::endl; - std::cerr << (int)alleleA << "," << (int)alleleB << std::endl; - std::cerr << std::bitset<8>(fmt_type_value2) << "," << std::bitset<8>(fmt_type_value1) << std::endl; - exit(1); - } - - ++this->matrix_[alleleA][alleleB]; - //++this->vectorA_[alleleA]; - //++this->vectorB_[alleleB]; - } - } - -private: - double __calculateHardyWeinberg(const U32 ref_target, const U32 alt_target) const{ - U64 obs_hets = this->matrix_[ref_target][alt_target] + this->matrix_[alt_target][ref_target]; - U64 obs_hom1 = this->matrix_[ref_target][ref_target]; - U64 obs_hom2 = this->matrix_[alt_target][alt_target]; - if(obs_hets + obs_hom1 + obs_hom2 == 0) return 1; - - U64 obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1; - U64 obs_homr = obs_hom1 < obs_hom2 ? 
obs_hom1 : obs_hom2; - - int64_t rare_copies = 2 * obs_homr + obs_hets; - int64_t genotypes = obs_hets + obs_homc + obs_homr; - - double* het_probs = new double[rare_copies + 1]; - - int64_t i; - for (i = 0; i <= rare_copies; ++i) het_probs[i] = 0.0; - - // start at midpoint - int64_t mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes); - - // check to ensure that midpoint and rare alleles have same parity - if ((rare_copies & 1) ^ (mid & 1)) ++mid; - - int64_t curr_hets = mid; - int64_t curr_homr = (rare_copies - mid) / 2; - int64_t curr_homc = genotypes - curr_hets - curr_homr; - - het_probs[mid] = 1.0; - double sum = het_probs[mid]; - for (curr_hets = mid; curr_hets > 1; curr_hets -= 2){ - het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); - sum += het_probs[curr_hets - 2]; - - // 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote - ++curr_homr; - ++curr_homc; - } - - curr_hets = mid; - curr_homr = (rare_copies - mid) / 2; - curr_homc = genotypes - curr_hets - curr_homr; - for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2){ - het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc /((curr_hets + 2.0) * (curr_hets + 1.0)); - sum += het_probs[curr_hets + 2]; - - // add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote - --curr_homr; - --curr_homc; - } - - for (i = 0; i <= rare_copies; i++) het_probs[i] /= sum; - - double p_hwe = 0.0; - /* p-value calculation for p_hwe */ - for (i = 0; i <= rare_copies; i++){ - if (het_probs[i] > het_probs[obs_hets]) - continue; - - p_hwe += het_probs[i]; - } - - p_hwe = p_hwe > 1.0 ? 
1.0 : p_hwe; - - delete [] het_probs; - - return(p_hwe); - } - -public: - BYTE n_alleles_; // number of alleles (including EOV and missing) - value_type** matrix_; - value_type* vectorA_; - value_type* vectorB_; -}; - -} -} - - - -#endif /* ENCODERGENOTYPESRLEHELPER_H_ */ diff --git a/lib/core/genotypes.cpp b/lib/core/genotypes.cpp new file mode 100644 index 0000000..5e612f4 --- /dev/null +++ b/lib/core/genotypes.cpp @@ -0,0 +1,352 @@ +#include "genotypes.h" + +namespace tachyon{ + +yon_gt::~yon_gt(){ + delete [] d_bcf; + delete [] d_bcf_ppa, + delete [] rcds; + delete [] d_exp; + if(d_occ != nullptr){ + for(U32 i = 0; i < this->n_o; ++i) + delete [] d_occ[i]; + } + delete [] n_occ; + delete [] d_occ; + delete itree; +} + +bool yon_gt_summary::AddGenotypeLayer(yon_gt_summary_obj* target, uint8_t depth){ + if(depth == this->n_ploidy) return false; + if(target->children == nullptr){ + target->children = new yon_gt_summary_obj[this->n_alleles]; + } + + for(U32 i = 0; i < this->n_alleles; ++i) + this->AddGenotypeLayer(&target->children[i], depth+1); + + return false; +} + +yon_gt_summary& yon_gt_summary::operator+=(const yon_gt& gt){ + assert(gt.rcds != nullptr); + + // Iterate over available genotype records. + for(U32 i = 0; i < gt.n_i; ++i){ + // Target root node. + yon_gt_summary_obj* target = &this->gt[gt.rcds[i].allele[0] >> 1]; + target->n_cnt += gt.rcds[i].run_length; + + // Iterate over alleles given the base ploidy. + for(U32 j = 0; j < gt.m; ++j){ + assert((gt.rcds[i].allele[j] >> 1) < this->n_alleles); + // Add allelic counts. + this->alleles[gt.rcds[i].allele[j] >> 1] += gt.rcds[i].run_length; + // Add strand-specific (ploidy-aware) alleleic counts. + this->alleles_strand[j][gt.rcds[i].allele[j] >> 1] += gt.rcds[i].run_length; + } + + // Update remainder. 
+ for(U32 j = 1; j < gt.m; ++j){ + target = &target->children[gt.rcds[i].allele[j] >> 1]; + target->n_cnt += gt.rcds[i].run_length; + } + } + return(*this); +} + +std::vector yon_gt_summary::GetAlleleCounts(void) const{ + std::vector c_allele(this->n_alleles, 0); + for(U32 i = 0; i < this->n_alleles; ++i) + c_allele[i] = this->alleles[i]; + + return(c_allele); +} + +std::vector< std::pair > yon_gt_summary::GetAlleleCountFrequency(void) const{ + std::vector< std::pair > c_allele(this->n_alleles); + uint64_t n_total = 0; + for(U32 i = 0; i < 2; ++i){ + c_allele[i].first = this->alleles[i]; + } + + for(U32 i = 2; i < this->n_alleles; ++i){ + c_allele[i].first = this->alleles[i]; + n_total += this->alleles[i]; + } + + if(n_total != 0){ + for(U32 i = 0; i < this->n_alleles; ++i){ + c_allele[i].second = (double)c_allele[i].first / n_total; + } + } + + return(c_allele); +} + +std::vector< std::vector > yon_gt_summary::GetAlleleStrandCounts(void) const{ + std::vector< std::vector > c_allele(this->n_ploidy, std::vector(this->n_alleles)); + for(U32 i = 0; i < this->n_ploidy; ++i){ + for(U32 j = 0; j < this->n_alleles; ++j){ + c_allele[i][j] = this->alleles_strand[i][j]; + } + } + + return(c_allele); +} + +bool yon_gt_summary::GetGenotype(std::vector& data, + yon_gt_summary_obj* target, + uint8_t depth) const +{ + if(depth + 1 == this->n_ploidy){ + assert(target->children != nullptr); + for(U32 i = 0; i < this->n_alleles; ++i){ + if(target->children[i].n_cnt != 0){ + data.push_back(target->children[i].n_cnt); + } + } + return false; + } + + for(U32 i = 0; i < this->n_alleles; ++i){ + this->GetGenotype(data, &target->children[i], depth + 1); + } + + return(true); +} + +// Todo: unfinished. +std::vector yon_gt_summary::GetGenotypeCounts(bool drop_empty) const{ + std::vector genotypes; + + // Traverse the trie and store records when hitting the leafs. + std::vector d; + + // If the target ploidy is greater than one (haploid) we collect + // the genotypes by traversing the trie. 
Otherwise the genotypes + // are the allele counts at the roots. + if(this->n_ploidy > 1){ + for(U32 i = 0; i < this->n_alleles; ++i){ + //std::cerr << "outer " << i << "/" << (int)this->n_alleles << std::endl; + this->GetGenotype(d, &this->gt[i], 1); + } + } else { + for(U32 i = 0; i < this->n_alleles; ++i){ + if(this->gt[i].n_cnt != 0){ + d.push_back(this->gt[i].n_cnt); + } + } + } + uint64_t n_total = 0; + for(U32 i = 0; i < d.size(); ++i) + n_total += d[i]; + std::cerr << "collected: " << d.size() << " total = " << n_total << std::endl; + return(genotypes); +} + +std::vector yon_gt_summary::GetStrandBiasAlleles(const bool phred_scale) const{ + if(this->n_ploidy != 2 || this->n_alleles + 2 < 2) + return std::vector(); + + std::vector strand_bias_p_values; + double fisher_left_p, fisher_right_p, fisher_twosided_p; + + uint64_t n_cnt_fwd = 0; + uint64_t n_cnt_rev = 0; + for(U32 i = 2; i < this->n_alleles; ++i){ + n_cnt_fwd += this->alleles_strand[0][i]; + n_cnt_rev += this->alleles_strand[1][i]; + } + + kt_fisher_exact( + this->alleles_strand[0][2], // A: Allele on forward strand + this->alleles_strand[1][2], // B: Allele on reverse strand + n_cnt_fwd - this->alleles_strand[0][2], // C: Not allele on forward strand + n_cnt_rev - this->alleles_strand[1][2], // D: Not allele on reverse strand + &fisher_left_p, &fisher_right_p, &fisher_twosided_p); + + if(phred_scale) strand_bias_p_values.push_back(std::abs(-10 * log10(fisher_twosided_p))); + else strand_bias_p_values.push_back(fisher_twosided_p); + + // If n_alleles = 2 then they are identical because of symmetry + if(this->n_alleles - 2 > 2){ + for(U32 p = 3; p < this->n_alleles; ++p){ + kt_fisher_exact( + this->alleles_strand[0][p], // A: Allele on forward strand + this->alleles_strand[1][p], // B: Allele on reverse strand + n_cnt_fwd - this->alleles_strand[0][p], // C: Not allele on forward strand + n_cnt_rev - this->alleles_strand[1][p], // D: Not allele on reverse strand + &fisher_left_p, &fisher_right_p, 
&fisher_twosided_p); + + if(phred_scale) strand_bias_p_values.push_back(std::abs(-10 * log10(fisher_twosided_p))); + else strand_bias_p_values.push_back(fisher_twosided_p); + } + } + return(strand_bias_p_values); +} + +double yon_gt_summary::CalculateHardyWeinberg(void) const{ + if(this->n_ploidy != 2 || this->n_alleles - 2 != 2) return -1; + + U64 obs_hets = this->gt[2][3].n_cnt + this->gt[3][2].n_cnt; // alts + U64 obs_hom1 = this->gt[2][2].n_cnt; // hom ref + U64 obs_hom2 = this->gt[3][3].n_cnt; // hom alt + + U64 obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1; + U64 obs_homr = obs_hom1 < obs_hom2 ? obs_hom1 : obs_hom2; + + int64_t rare_copies = 2 * obs_homr + obs_hets; + int64_t genotypes = obs_hets + obs_homc + obs_homr; + + double* het_probs = new double[rare_copies + 1]; + + int64_t i; + for (i = 0; i <= rare_copies; ++i) + het_probs[i] = 0.0; + + /* start at midpoint */ + int64_t mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes); + + /* check to ensure that midpoint and rare alleles have same parity */ + if ((rare_copies & 1) ^ (mid & 1)) + ++mid; + + int64_t curr_hets = mid; + int64_t curr_homr = (rare_copies - mid) / 2; + int64_t curr_homc = genotypes - curr_hets - curr_homr; + + het_probs[mid] = 1.0; + double sum = het_probs[mid]; + for (curr_hets = mid; curr_hets > 1; curr_hets -= 2){ + het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) + / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); + sum += het_probs[curr_hets - 2]; + + /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */ + ++curr_homr; + ++curr_homc; + } + + curr_hets = mid; + curr_homr = (rare_copies - mid) / 2; + curr_homc = genotypes - curr_hets - curr_homr; + for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2){ + het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc + /((curr_hets + 2.0) * (curr_hets + 1.0)); + sum += het_probs[curr_hets + 2]; + + /* add 2 heterozygotes 
for next iteration -> subtract one rare, one common homozygote */ + --curr_homr; + --curr_homc; + } + + for (i = 0; i <= rare_copies; i++) + het_probs[i] /= sum; + + double p_hwe = 0.0; + /* p-value calculation for p_hwe */ + for (i = 0; i <= rare_copies; i++){ + if (het_probs[i] > het_probs[obs_hets]) + continue; + + p_hwe += het_probs[i]; + } + + p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; + + delete [] het_probs; + + return(p_hwe); +} + +bool yon_gt_summary::LazyEvaluate(void){ + delete this->d; + this->d = new yon_gt_summary_rcd; + + // Allele count and frequency + this->d->n_ploidy = this->n_ploidy; + this->d->n_ac_af = this->n_alleles; + this->d->ac = new uint64_t[this->n_alleles]; + this->d->af = new double[this->n_alleles]; + + uint64_t n_total = 0; + for(U32 i = 0; i < 2; ++i) this->d->ac[i] = this->alleles[i]; + for(U32 i = 2; i < this->n_alleles; ++i){ + this->d->ac[i] = this->alleles[i]; + n_total += this->alleles[i]; + } + + if(n_total != 0){ + for(U32 i = 0; i < this->n_alleles; ++i) + this->d->af[i] = (double)this->d->ac[i] / n_total; + + } else { + for(U32 i = 0; i < this->n_alleles; ++i) + this->d->af[i] = 0; + } + + this->d->npm = this->d->ac[0]; + this->d->nm = n_total; + this->d->an = n_total + this->d->ac[0]; + this->d->hwe_p = this->CalculateHardyWeinberg(); + this->d->ac_p = this->alleles_strand; + + // Strand-specific bias and inbreeding coefficient (F-statistic) + if(this->n_ploidy == 2){ + this->d->heterozygosity = ((double)this->gt[2][3].n_cnt + this->gt[3][2].n_cnt) / + (this->gt[2][2].n_cnt + this->gt[2][3].n_cnt + this->gt[3][2].n_cnt + this->gt[3][3].n_cnt); + + uint8_t n_fs_used = (this->n_alleles - 2 == 2 ? 
1 : this->n_alleles - 2); + this->d->n_fs = n_fs_used; + this->d->fs_a = new double[n_fs_used]; + double fisher_left_p, fisher_right_p, fisher_twosided_p; + uint64_t n_cnt_fwd = 0; + uint64_t n_cnt_rev = 0; + for(U32 i = 2; i < this->n_alleles; ++i){ + n_cnt_fwd += this->alleles_strand[0][i]; + n_cnt_rev += this->alleles_strand[1][i]; + } + + kt_fisher_exact( + this->alleles_strand[0][2], // A: Allele on forward strand + this->alleles_strand[1][2], // B: Allele on reverse strand + n_cnt_fwd - this->alleles_strand[0][2], // C: Not allele on forward strand + n_cnt_rev - this->alleles_strand[1][2], // D: Not allele on reverse strand + &fisher_left_p, &fisher_right_p, &fisher_twosided_p); + + this->d->fs_a[0] = std::abs(-10 * log10(fisher_twosided_p)); + + // If n_alleles = 2 then they are identical because of symmetry + uint8_t pos = 1; + if(this->n_alleles - 2 > 2){ + for(U32 p = 3; p < this->n_alleles; ++p){ + kt_fisher_exact( + this->alleles_strand[0][p], // A: Allele on forward strand + this->alleles_strand[1][p], // B: Allele on reverse strand + n_cnt_fwd - this->alleles_strand[0][p], // C: Not allele on forward strand + n_cnt_rev - this->alleles_strand[1][p], // D: Not allele on reverse strand + &fisher_left_p, &fisher_right_p, &fisher_twosided_p); + + this->d->fs_a[pos++] = std::abs(-10 * log10(fisher_twosided_p)); + } + } + + // Total number of genotypes is the sum of the root + // nodes excluding special missing and sentinel node + // (0 and 1). 
+ uint64_t n_genotypes = this->gt[2][2].n_cnt + this->gt[2][3].n_cnt + this->gt[3][2].n_cnt + this->gt[3][3].n_cnt; + + // Allele frequency of A + const double p = ((double)2*this->gt[2][2].n_cnt + this->gt[2][3].n_cnt + this->gt[3][2].n_cnt) / (2*n_genotypes); + // Genotype frequency of heterozyotes + const double pg = ((double)this->gt[2][3].n_cnt + this->gt[3][2].n_cnt) / n_genotypes; + // Expected heterozygosity + const double exp = 2*p*(1-p); + // Population inbreeding coefficient: F + const double f_pic = exp > 0 ? (exp-pg)/exp : 0; + this->d->f_pic = f_pic; + } +} + +} diff --git a/lib/core/genotypes.h b/lib/core/genotypes.h new file mode 100644 index 0000000..2755c16 --- /dev/null +++ b/lib/core/genotypes.h @@ -0,0 +1,834 @@ +#ifndef CORE_GENOTYPES_H_ +#define CORE_GENOTYPES_H_ + +#include + +#include "htslib/vcf.h" + +#include "io/basic_buffer.h" +#include "containers/components/generic_iterator.h" +#include "third_party/intervalTree.h" +#include "math/fisher_math.h" +#include "utility/support_vcf.h" + +namespace tachyon{ + +#define YON_GT_RLE_ALLELE_A(PRIMITITVE, SHIFT, ADD) (((PRIMITITVE) & ((1 << (SHIFT)) - 1) << (ADD)) >> (ADD)); +#define YON_GT_RLE_ALLELE_B(PRIMITIVE, SHIFT, ADD) (((PRIMITIVE) & ((1 << (SHIFT)) - 1) << ((ADD)+(SHIFT))) >> ((ADD)+(SHIFT))); +#define YON_GT_RLE_LENGTH(PRIMITIVE, SHIFT, ADD) ((PRIMITIVE) >> (2*(SHIFT) + (ADD))) +#define YON_GT_DIPLOID_ALLELE_LOOKUP(A,B,shift,mask) (((A) & (mask)) << (shift)) | ((B) & (mask)) +#define YON_GT_DIPLOID_BCF_A(PRIMITIVE, SHIFT) (((PRIMITIVE) >> ((SHIFT) + 1)) & (((U64)1 << (SHIFT)) - 1)) +#define YON_GT_DIPLOID_BCF_B(PRIMITIVE, SHIFT) (((PRIMITIVE) >> 1) & (((U64)1 << (SHIFT)) - 1)) +#define YON_GT_DIPLOID_BCF_PHASE(PRIMITIVE) ((PRIMITIVE) & 1) + +#define YON_GT_UN_NONE 0 // nothing +#define YON_GT_UN_INT 1 // rcds +#define YON_GT_UN_BCF 2|YON_GT_UN_INT // bcf +#define YON_GT_UN_SIMPLE 4|YON_GT_UN_INT // simple +#define YON_GT_UN_BCF_PPA 8|YON_GT_UN_BCF // bcf unpermuted +#define 
YON_GT_UN_SIMPLE_PPA 16|YON_GT_UN_SIMPLE // bcf unpermuted +#define YON_GT_UN_ALL (YON_GT_UN_BCF_PPA|YON_GT_UN_SIMPLE_PPA) // everything + +// 0 for missing and 1 for sentinel node. Note that the +// sentinel node never occurs in this encoding type. +const uint8_t YON_GT_RLE_RECODE[3] = {2, 3, 0}; + +// Vcf:INFO names for fields annotated when triggering +// annotation of genotypes. +const std::vector< std::string > YON_GT_ANNOTATE_FIELDS = {"NM","NPM","AN","HWE_P","AC","AF","AC_P","FS_A","F_PIC","HET","MULTI_ALLELIC"}; + +// Basic structure that maintains the permutation +// order of the samples in relation to the global header. +// This object is required if you want to use individual +// genotypes in the ORIGINAL order. If this is not required +// in your use-case then this structure has no value. +struct yon_gt_ppa { + yon_gt_ppa(void) : n_samples(0), ordering(nullptr){} + yon_gt_ppa(const uint32_t n_samples) : n_samples(n_samples), ordering(new uint32_t[n_samples]){ this->reset(); } + ~yon_gt_ppa(void){ delete [] this->ordering; } + + uint32_t& operator[](const uint32_t& position){ return(this->ordering[position]); } + const uint32_t& operator[](const uint32_t& position) const{ return(this->ordering[position]); } + uint32_t& at(const uint32_t& position){ return(this->ordering[position]); } + const uint32_t& at(const uint32_t& position) const{ return(this->ordering[position]); } + + void Allocate(const uint32_t n_samples){ + delete [] this->ordering; + this->n_samples = n_samples; + this->ordering = new uint32_t[n_samples]; + this->reset(); + } + + void reset(void){ + for(U32 i = 0; i < this->n_samples; ++i) + this->ordering[i] = i; + } + + friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, yon_gt_ppa& ppa){ + io::DeserializePrimitive(ppa.n_samples, buffer); + ppa.ordering = new uint32_t[ppa.n_samples]; + for(U32 i = 0; i < ppa.n_samples; ++i) + io::DeserializePrimitive(ppa.ordering[i], buffer); + + return(buffer); + } + + friend io::BasicBuffer& 
operator<<(io::BasicBuffer& buffer, const yon_gt_ppa& ppa){ + io::SerializePrimitive(ppa.n_samples, buffer); + for(U32 i = 0; i < ppa.n_samples; ++i) + io::SerializePrimitive(ppa.ordering[i], buffer); + + return(buffer); + } + + uint32_t n_samples; + uint32_t* ordering; +}; + +struct yon_radix_gt { + yon_radix_gt() : n_ploidy(0), n_allocated(4), id(0), alleles(new uint16_t[this->n_allocated]){ + memset(this->alleles, 0, sizeof(uint16_t)*this->n_allocated); + } + ~yon_radix_gt(){ delete [] this->alleles; } + yon_radix_gt(const yon_radix_gt& other) : n_ploidy(other.n_ploidy), n_allocated(other.n_allocated), id(other.id), alleles(new uint16_t[this->n_allocated]) + { + memcpy(this->alleles, other.alleles, sizeof(uint16_t)*this->n_allocated); + } + yon_radix_gt(yon_radix_gt&& other) : n_ploidy(other.n_ploidy), n_allocated(other.n_allocated), id(other.id), alleles(other.alleles) + { + other.alleles = nullptr; + } + + yon_radix_gt& operator=(const yon_radix_gt& other) // copy assignment + { + this->id = other.id; + this->n_ploidy = other.n_ploidy; + this->n_allocated = other.n_allocated; + delete [] this->alleles; + this->alleles = new uint16_t[this->n_allocated]; + memcpy(this->alleles, other.alleles, sizeof(uint16_t)*this->n_allocated); + return *this; + } + yon_radix_gt& operator=(yon_radix_gt&& other) // move assignment + { + if(this!=&other) // prevent self-move + { + this->id = other.id; + this->n_ploidy = other.n_ploidy; + this->n_allocated = other.n_allocated; + delete [] this->alleles; + this->alleles = other.alleles; + other.alleles = nullptr; + } + return *this; + } + + bool operator<(const yon_radix_gt& other) const{ + // Do not compare incremental sample identification + // numbers as that is not the desired outcome of + // the sort. 
+ if(this->n_ploidy < other.n_ploidy) return true; + if(other.n_ploidy < this->n_ploidy) return false; + + for(U32 i = 0; i < this->n_ploidy; ++i){ + if(this->alleles[i] < other.alleles[i]) + return true; + } + return false; + } + + bool operator==(const yon_radix_gt& other) const{ + // Do not compare incremental sample identification + // numbers as that is not the desired outcome of + // the comparison. + if(this->n_ploidy != other.n_ploidy) + return false; + + for(U32 i = 0; i < this->n_ploidy; ++i){ + if(this->alleles[i] != other.alleles[i]) + return false; + } + return true; + } + + inline bool operator!=(const yon_radix_gt& other) const{ return(!(*this == other)); } + + friend std::ostream& operator<<(std::ostream& stream, const yon_radix_gt& genotype){ + stream << genotype.id << ":"; + if(genotype.n_ploidy){ + stream << genotype.alleles[0]; + for(U32 i = 1; i < genotype.n_ploidy; ++i){ + stream << "," << genotype.alleles[i]; + } + } + return(stream); + } + + U64 GetPackedInteger(const uint8_t& shift_size = 8) const{ + U64 packed = 0; + for(U32 i = 0; i < this->n_ploidy; ++i){ + packed <<= shift_size; + assert(((this->alleles[i] << shift_size) >> shift_size) == this->alleles[i]); + packed |= (this->alleles[i] & ((1 << shift_size)) - 1); + } + return packed; + } + + void resize(const uint8_t new_ploidy){ + uint16_t* temp = new uint16_t[new_ploidy]; + memcpy(temp, this->alleles, this->n_allocated * sizeof(uint16_t)); + delete [] this->alleles; + this->alleles = temp; + this->n_allocated = new_ploidy; + } + + uint8_t n_ploidy; + uint8_t n_allocated; + uint64_t id; + uint16_t* alleles; +}; + +// Primary generic Tachyon FORMAT:GT structure. 
+struct yon_gt_rcd { + yon_gt_rcd() : run_length(0), allele(nullptr){} + ~yon_gt_rcd(){ delete [] this->allele; } + yon_gt_rcd(const yon_gt_rcd& other) = delete; // disallow copy ctor + yon_gt_rcd& operator=(const yon_gt_rcd& other) = delete; // disallow move assign + yon_gt_rcd(yon_gt_rcd&& other) : run_length(other.run_length), allele(other.allele){ + other.allele = nullptr; + } + yon_gt_rcd& operator=(yon_gt_rcd&& other){ + if(this == &other) return(*this); + delete this->allele; + this->allele = other.allele; + other.allele = nullptr; + this->run_length = other.run_length; + return(*this); + } + + // Rule of 5 + + io::BasicBuffer& PrintVcf(io::BasicBuffer& buffer, const uint8_t& n_ploidy){ + if(this->allele[0] == 1){ + buffer += '.'; + return(buffer); + } + if(this->allele[0] == 0) buffer += '.'; + else buffer.AddReadble(((this->allele[0] >> 1) - 2)); + + for(U32 i = 1; i < n_ploidy; ++i){ + if(this->allele[i] == 1) break; + buffer += ((this->allele[i] & 1) ? '|' : '/'); + if(this->allele[i] == 0) buffer += '.'; + else buffer.AddReadble(((this->allele[i] >> 1) - 2)); + } + return(buffer); + } + + uint32_t run_length; + //uint8_t alleles; // Determined from base ploidy + uint8_t* allele; // contains phase at first bit +}; + +// Forward declare. +struct yon_gt_summary; + +struct yon_gt { + uint8_t add : 7, + global_phase : 1; + uint8_t shift; + uint8_t p, m, method; // bytes per entry, base ploidy, base method + uint32_t n_s, n_i, n_o; // number samples, number of entries + uint8_t n_allele; + yon_gt_ppa* ppa; // pointer to ppa + std::vector< std::vector >* occ; // pointer to occ + uint8_t* data; // pointer to data + uint8_t* d_bcf; // lazy evaluated as Bcf entries (length = base_ploidy * n_samples * sizeof(uint8_t)) + uint8_t* d_bcf_ppa; // lazy evaluation of unpermuted bcf records + yon_gt_rcd** d_exp; // lazy evaluated from ppa/normal to internal offset (length = n_samples). This can be very expensive if evaluated internally for every record. 
+ yon_gt_rcd* rcds; // lazy interpreted internal records + uint32_t* n_occ; + yon_gt_rcd** d_occ; // lazy evaluation of occ table + algorithm::IntervalTree* itree; // interval tree for consecutive ranges + bool dirty; + + typedef yonRawIterator iterator; + typedef yonRawIterator const_iterator; + + yon_gt() : add(0), global_phase(0), shift(0), p(0), m(0), method(0), n_s(0), n_i(0), n_o(0), + n_allele(0), ppa(nullptr), occ(nullptr), data(nullptr), d_bcf(nullptr), + d_bcf_ppa(nullptr), d_exp(nullptr), rcds(nullptr), n_occ(nullptr), d_occ(nullptr), + itree(nullptr), dirty(false) + {} + + ~yon_gt(); + + bool Evaluate(void){ + if(this->method == 1) return(this->EvaluateRecordsM1()); + else if(this->method == 2) return(this->EvaluateRecordsM2()); + else if(this->method == 4) return(this->EvaluateRecordsM4()); + else { + std::cerr << "not implemented method " << (int)this->method << std::endl; + } + return false; + } + + bool EvaluateRecordsM1(){ + switch(this->p){ + case(1): return(this->EvaluateRecordsM1_()); + case(2): return(this->EvaluateRecordsM1_()); + case(4): return(this->EvaluateRecordsM1_()); + case(8): return(this->EvaluateRecordsM1_()); + default: + std::cerr << "illegal primitive in EvaluateRecordsM1" << std::endl; + exit(1); + } + } + + template + bool EvaluateRecordsM1_(){ + if(this->rcds != nullptr) delete [] this->rcds; + assert(this->m == 2); + assert(this->n_allele == 2); + + // Allocate memory for new records. + this->rcds = new yon_gt_rcd[this->n_i]; + + // Reinterpret byte stream into the appropriate actual + // primitive type. + const T* r_data = reinterpret_cast(this->data); + + // Keep track of the cumulative number of genotypes observed + // as a means of asserting correctness. + uint64_t n_total = 0; + + // Iterate over the internal run-length encoded genotypes + // and populate the rcds structure. 
+ for(uint32_t i = 0; i < this->n_i; ++i){ + BYTE phasing = 0; + if(add) phasing = r_data[i] & 1; + else phasing = this->global_phase; + + this->rcds[i].run_length = YON_GT_RLE_LENGTH(r_data[i], shift, add); + this->rcds[i].allele = new uint8_t[2]; + this->rcds[i].allele[0] = YON_GT_RLE_ALLELE_B(r_data[i], shift, add); + this->rcds[i].allele[1] = YON_GT_RLE_ALLELE_A(r_data[i], shift, add); + // Store an allele encoded as (ALLELE << 1 | phasing). + this->rcds[i].allele[0] = (YON_GT_RLE_RECODE[this->rcds[i].allele[0]] << 1) | phasing; + this->rcds[i].allele[1] = (YON_GT_RLE_RECODE[this->rcds[i].allele[1]] << 1) | phasing; + n_total += this->rcds[i].run_length; + } + assert(n_total == this->n_s); + } + + bool EvaluateRecordsM2(){ + switch(this->p){ + case(1): return(this->EvaluateRecordsM2_()); + case(2): return(this->EvaluateRecordsM2_()); + case(4): return(this->EvaluateRecordsM2_()); + case(8): return(this->EvaluateRecordsM2_()); + default: + std::cerr << "illegal primitive in EvaluateRecordsM2" << std::endl; + exit(1); + } + } + + template + bool EvaluateRecordsM2_(){ + if(this->rcds != nullptr) delete [] this->rcds; + assert(this->m == 2); + + // Allocate memory for new records. + this->rcds = new yon_gt_rcd[this->n_i]; + + // Reinterpret byte stream into the appropriate actual + // primitive type. + const T* r_data = reinterpret_cast(this->data); + + // Keep track of the cumulative number of genotypes observed + // as a means of asserting correctness. + uint64_t n_total = 0; + + // Iterate over the internal run-length encoded genotypes + // and populate the rcds structure. 
+ for(uint32_t i = 0; i < this->n_i; ++i){ + BYTE phasing = 0; + if(add) phasing = r_data[i] & 1; + else phasing = this->global_phase; + + this->rcds[i].run_length = YON_GT_RLE_LENGTH(r_data[i], shift, add); + this->rcds[i].allele = new uint8_t[2]; + this->rcds[i].allele[0] = YON_GT_RLE_ALLELE_B(r_data[i], shift, add); + this->rcds[i].allele[1] = YON_GT_RLE_ALLELE_A(r_data[i], shift, add); + // Store an allele encoded as (ALLELE << 1 | phasing). + this->rcds[i].allele[0] = (this->rcds[i].allele[0] << 1) | phasing; + this->rcds[i].allele[1] = (this->rcds[i].allele[1] << 1) | phasing; + n_total += this->rcds[i].run_length; + } + assert(n_total == this->n_s); + } + + bool EvaluateRecordsM4(){ + switch(this->p){ + case(1): return(this->EvaluateRecordsM4_()); + case(2): return(this->EvaluateRecordsM4_()); + case(4): return(this->EvaluateRecordsM4_()); + case(8): return(this->EvaluateRecordsM4_()); + default: + std::cerr << "illegal primitive in EvaluateRecordsM1" << std::endl; + exit(1); + } + } + + template + bool EvaluateRecordsM4_(){ + if(this->rcds != nullptr) delete [] this->rcds; + assert(this->m != 2); + + // Allocate memory for new records. + this->rcds = new yon_gt_rcd[this->n_i]; + + // Keep track of the cumulative number of genotypes observed + // as a means of asserting correctness. + uint64_t n_total = 0; + uint64_t b_offset = 0; + + // Iterate over the internal run-length encoded genotypes + // and populate the rcds structure. + for(uint32_t i = 0; i < this->n_i; ++i){ + T* run_length = reinterpret_cast(&this->data[b_offset]); + b_offset += sizeof(T); + + this->rcds[i].run_length = *run_length; + this->rcds[i].allele = new uint8_t[this->m]; + for(U32 j = 0; j < this->m; ++j, ++b_offset) + this->rcds[i].allele[j] = this->data[b_offset]; + + n_total += this->rcds[i].run_length; + } + assert(n_total == this->n_s); + } + + // Requires base evaluation to rcds structures first. 
+ bool EvaluateBcf(){ + if(this->rcds == nullptr){ + std::cerr << "have to evaluate rcds first" << std::endl; + return false; + } + + switch(this->p){ + case(1): return(this->EvaluateBcf_()); + case(2): return(this->EvaluateBcf_()); + case(4): return(this->EvaluateBcf_()); + case(8): return(this->EvaluateBcf_()); + default: + std::cerr << "illegal primitive in EvaluateBcf" << std::endl; + exit(1); + } + } + + template + bool EvaluateBcf_(){ + assert(this->rcds != nullptr); + if(this->d_bcf != nullptr) delete [] this->d_bcf; + + uint64_t cum_pos = 0; + this->d_bcf = new uint8_t[this->m * this->n_s]; + for(uint32_t i = 0; i < this->n_i; ++i){ + for(uint32_t j = 0; j < this->rcds[i].run_length; ++j){ + for(uint32_t k = 0; k < this->m; ++k, ++cum_pos){ + this->d_bcf[cum_pos] = this->rcds[i].allele[k]; // Todo: recode back from YON-style to htslib style (e.g. 0,1 special meaning in YON) + } + } + } + assert(cum_pos == this->n_s * this->m); + return true; + } + + bool EvaluatePpaFromBcf(const uint8_t* d_bcf_pre){ + assert(d_bcf_pre != nullptr); + assert(this->ppa != nullptr); + + if(this->d_bcf_ppa != nullptr) delete [] this->d_bcf_ppa; + this->d_bcf_ppa = new uint8_t[this->n_s * this->m]; + + uint64_t cum_pos = 0; + for(uint32_t i = 0; i < this->n_s; ++i){ + const uint32_t& ppa_target = this->ppa->ordering[i]; + const uint8_t* bcf_target = &d_bcf_pre[ppa_target * this->m]; + + for(uint32_t k = 0; k < this->m; ++k, ++cum_pos){ + this->d_bcf_ppa[cum_pos] = bcf_target[k]; + } + } + + assert(cum_pos == this->n_s * this->m); + return true; + } + + bool EvaluateIntervalTree(void){ + assert(this->rcds != nullptr); + if(this->itree != nullptr) delete this->itree; + + std::vector< algorithm::Interval > intervals; + uint64_t cum_pos = 0; + for(uint32_t i = 0; i < this->n_i; ++i){ + intervals.push_back(algorithm::Interval( + cum_pos, + cum_pos + this->rcds[i].run_length, + &this->rcds[i]) + ); + cum_pos += this->rcds[i].run_length; + } + this->itree = new 
algorithm::IntervalTree(std::move(intervals)); + + assert(cum_pos == this->n_s); + return true; + } + + /**< + * Lazy evaluated (expands) (possibly) run-length encoded rcds structures + * into a vector of pointers corresponding to distinct samples in the order + * they were stored. This unpermutation (restoration) of order requires the + * ppa array (stored at YON_BLK_PPA). + * + * This function should be considered private as the generic wrapper function + * Expand() calls this function if ppa is available or ExpandRecords() + * otherwise. + * @return Always returns TRUE if the critical assertions passes. + */ + bool ExpandRecordsPpa(void){ + assert(this->rcds != nullptr); + assert(this->ppa != nullptr); + + if(this->d_exp != nullptr) delete [] this->d_exp; + this->d_exp = new yon_gt_rcd*[this->n_s]; + + uint64_t cum_sample = 0; + uint64_t cum_offset = 0; + for(uint32_t i = 0; i < this->n_i; ++i){ + for(uint32_t j = 0; j < this->rcds[i].run_length; ++j, ++cum_sample){ + const uint32_t& target_ppa = this->ppa->at(cum_sample); + this->d_exp[target_ppa] = &this->rcds[i]; + cum_offset += this->p * this->m; + } + } + assert(cum_sample == this->n_s); + assert(cum_offset == this->n_s * this->m * this->p); + return true; + } + + bool ExpandRecordsPpaExternal(yon_gt_rcd** d_expe){ + assert(this->rcds != nullptr); + assert(this->ppa != nullptr); + + uint64_t cum_sample = 0; + uint64_t cum_offset = 0; + for(uint32_t i = 0; i < this->n_i; ++i){ + for(uint32_t j = 0; j < this->rcds[i].run_length; ++j, ++cum_sample){ + const uint32_t& target_ppa = this->ppa->at(cum_sample); + d_expe[target_ppa] = &this->rcds[i]; + cum_offset += this->p * this->m; + } + } + assert(cum_sample == this->n_s); + assert(cum_offset == this->n_s * this->m * this->p); + return true; + } + + bool Expand(void){ + if(this->ppa != nullptr) + return(this->ExpandRecordsPpa()); + else return(this->ExpandRecords()); + } + + bool ExpandExternal(yon_gt_rcd** d_expe){ + if(this->ppa != nullptr) + 
return(this->ExpandRecordsPpaExternal(d_expe)); + else return(this->ExpandRecordsExternal(d_expe)); + } + + bool ExpandRecords(void){ + assert(this->rcds != nullptr); + + if(this->d_exp != nullptr) delete [] this->d_exp; + this->d_exp = new yon_gt_rcd*[this->n_s]; + + uint64_t cum_sample = 0; + uint64_t cum_offset = 0; + for(uint32_t i = 0; i < this->n_i; ++i){ + for(uint32_t j = 0; j < this->rcds[i].run_length; ++j, ++cum_sample){ + this->d_exp[cum_sample] = &this->rcds[i]; + cum_offset += this->p * this->m; + } + } + assert(cum_sample == this->n_s); + assert(cum_offset == this->n_s * this->m * this->p); + return true; + } + + bool ExpandRecordsExternal(yon_gt_rcd** d_expe){ + assert(this->rcds != nullptr); + + uint64_t cum_sample = 0; + uint64_t cum_offset = 0; + for(uint32_t i = 0; i < this->n_i; ++i){ + for(uint32_t j = 0; j < this->rcds[i].run_length; ++j, ++cum_sample){ + d_expe[cum_sample] = &this->rcds[i]; + cum_offset += this->p * this->m; + } + } + assert(cum_sample == this->n_s); + assert(cum_offset == this->n_s * this->m * this->p); + return true; + } + + template + inline T& GetPrimitive(const uint32_t sample){ return(*reinterpret_cast(&this->data[sample])); } + + template + inline T& GetPrimitivePpa(const uint32_t sample){ return(this->ppa[*reinterpret_cast(&this->data[sample])]); } + + // Iterator + inline iterator begin(){ return iterator(&this->rcds[0]); } + inline iterator end() { return iterator(&this->rcds[this->n_i]); } + inline const_iterator begin() const{ return const_iterator(&this->rcds[0]); } + inline const_iterator end() const{ return const_iterator(&this->rcds[this->n_i]); } + inline const_iterator cbegin() const{ return const_iterator(&this->rcds[0]); } + inline const_iterator cend() const{ return const_iterator(&this->rcds[this->n_i]); } + + bcf1_t* UpdateHtslibGenotypes(bcf1_t* rec, bcf_hdr_t* hdr) const{ + assert(this->d_exp != nullptr); + + int32_t* tmpi = new int32_t[this->n_s*this->m]; + uint32_t gt_offset = 0; + for(U32 i = 0; i 
< this->n_s; ++i){ + for(U32 j = 0; j < this->m; ++j, ++gt_offset){ + if(this->d_exp[i]->allele[j] == 0) tmpi[gt_offset] = 0; + else if(this->d_exp[i]->allele[j] == 1) tmpi[gt_offset] = 1; + else tmpi[gt_offset] = (((this->d_exp[i]->allele[j] >> 1) - 1) << 1) | (this->d_exp[i]->allele[j] & 1); + } + } + assert(gt_offset == this->n_s*this->m); + + bcf_update_genotypes(hdr, rec, tmpi, this->n_s*this->m); + delete [] tmpi; + return(rec); + } +}; + +struct yon_gt_summary_obj{ + yon_gt_summary_obj() : n_cnt(0), children(nullptr){} + ~yon_gt_summary_obj(){ delete [] this->children; } + + inline yon_gt_summary_obj& operator[](const uint32_t pos){ return(this->children[pos]); } + inline const yon_gt_summary_obj& operator[](const uint32_t pos) const{ return(this->children[pos]); } + + uint64_t n_cnt; + yon_gt_summary_obj* children; +}; + +struct yon_gt_summary_rcd { + yon_gt_summary_rcd() : + n_ploidy(0), n_ac_af(0), n_fs(0), ac(nullptr), af(nullptr), + nm(0), npm(0), an(0), ac_p(nullptr), fs_a(nullptr), hwe_p(0), + f_pic(0), heterozygosity(0) + {} + ~yon_gt_summary_rcd(){ + delete [] this->ac; delete [] this->af; + delete [] this->fs_a; + // Do not delete ac_p as it is borrowed + } + + bcf1_t* UpdateHtslibVcfRecord(bcf1_t* rec, bcf_hdr_t* hdr) const{ + utility::UpdateHtslibVcfRecordInfo(rec, hdr, "NM", (const U64*)&this->nm, 1); + utility::UpdateHtslibVcfRecordInfo(rec, hdr, "NPM", (const U64*)&this->npm, 1); + utility::UpdateHtslibVcfRecordInfo(rec, hdr, "AN", (const U64*)&this->an, 1); + utility::UpdateHtslibVcfRecordInfo(rec, hdr, "HWE_P", (const double*)&this->hwe_p, 1); + + if(this->n_ac_af > 4) + bcf_update_info_flag(hdr, rec, "MULTI_ALLELIC", NULL, 1); + + if(this->n_ac_af > 2){ + utility::UpdateHtslibVcfRecordInfo(rec, hdr, "AC", (const U64*)&this->ac[2], this->n_ac_af - 2); + utility::UpdateHtslibVcfRecordInfo(rec, hdr, "AF", (const double*)&this->af[2], this->n_ac_af - 2); + } + + std::vector ac_p; + for(U32 p = 0; p < this->n_ploidy; ++p){ + for(U32 i = 2; i < 
this->n_ac_af; ++i){ + ac_p.push_back(this->ac_p[p][i]); + } + } + utility::UpdateHtslibVcfRecordInfo(rec, hdr, "AC_P", (const U64*)ac_p.data(), ac_p.size()); + + if(this->fs_a != nullptr) + utility::UpdateHtslibVcfRecordInfo(rec, hdr, "FS_A", this->fs_a, this->n_fs); + + if(this->n_ploidy == 2){ + utility::UpdateHtslibVcfRecordInfo(rec, hdr, "FPIC", &this->f_pic, 1); + utility::UpdateHtslibVcfRecordInfo(rec, hdr, "HET", &this->heterozygosity, 1); + } + + return(rec); + } + + io::BasicBuffer& PrintVcf(io::BasicBuffer& buffer){ + buffer += "NM="; buffer.AddReadble((U64)this->nm); + buffer += ";NPM="; buffer.AddReadble((U64)this->npm); + buffer += ";AN="; buffer.AddReadble((U64)this->an); + buffer += ";HWE_P="; buffer.AddReadble((double)this->hwe_p); + if(this->n_ac_af > 4) buffer += ";MULTI_ALLELIC"; + + if(this->n_ac_af > 2){ + buffer += ";AC="; buffer.AddReadble((U64)this->ac[2]); + for(U32 i = 3; i < this->n_ac_af; ++i){ + buffer += ','; buffer.AddReadble((U64)this->ac[i]); + } + buffer += ";AF="; buffer.AddReadble((double)this->af[2]); + for(U32 i = 3; i < this->n_ac_af; ++i){ + buffer += ','; buffer.AddReadble((double)this->af[i]); + } + } + + buffer += ";AC_P="; + buffer.AddReadble((U64)this->ac_p[0][2]); + for(U32 i = 3; i < this->n_ac_af; ++i){ + buffer += ','; buffer.AddReadble((U64)this->ac_p[0][i]); + } + + for(U32 p = 1; p < this->n_ploidy; ++p){ + for(U32 i = 2; i < this->n_ac_af; ++i){ + buffer += ','; buffer.AddReadble((U64)this->ac_p[p][i]); + } + } + + if(this->fs_a != nullptr){ + buffer += ";FS_A="; + buffer.AddReadble((double)this->fs_a[0]); + for(U32 i = 1; i < this->n_fs; ++i){ + buffer += ','; buffer.AddReadble((double)this->fs_a[i]); + } + } + + if(this->n_ploidy == 2){ + buffer += ";F_PIC="; + buffer.AddReadble((double)this->f_pic); + buffer += ";HET="; + buffer.AddReadble((double)this->heterozygosity); + } + + return(buffer); + } + + uint8_t n_ploidy; + uint32_t n_ac_af; + uint32_t n_fs; + uint64_t* ac; // allele counts + double* af; // 
allele frequency + uint64_t nm; // number of non-sentinel, non-missing symbols + uint64_t npm;// number of missing symbols + uint64_t an; // number of non-sentinel symbols + // ac_p is borrowed from summary + uint64_t** ac_p; + double* fs_a;// fisher strand test p + double hwe_p;// hardy-weinberg p + double f_pic; + double heterozygosity; +}; + +struct yon_gt_summary{ + yon_gt_summary(void) : + n_ploidy(0), + n_alleles(0), + alleles(nullptr), + alleles_strand(nullptr), + gt(nullptr), + d(nullptr) + { + + } + + yon_gt_summary(const uint8_t base_ploidy, const uint8_t n_alleles) : + n_ploidy(base_ploidy), + n_alleles(n_alleles + 2), + alleles(new uint64_t[this->n_alleles]), + alleles_strand(new uint64_t*[this->n_ploidy]), + gt(new yon_gt_summary_obj[this->n_alleles]), + d(nullptr) + { + memset(this->alleles, 0, sizeof(uint64_t)*this->n_alleles); + for(U32 i = 0; i < this->n_ploidy; ++i){ + this->alleles_strand[i] = new uint64_t[this->n_alleles]; + memset(this->alleles_strand[i], 0, sizeof(uint64_t)*this->n_alleles); + } + + // Add layers to the root node. + for(U32 i = 0; i < this->n_alleles; ++i) + this->AddGenotypeLayer(&this->gt[i], 1); + } + + ~yon_gt_summary(){ + delete [] this->alleles; + delete [] this->gt; + if(this->alleles_strand != nullptr){ + for(U32 i = 0; i < this->n_ploidy; ++i) + delete this->alleles_strand[i]; + delete [] this->alleles_strand; + } + delete this->d; + } + + /**< + * Recursively add layers to the full trie in order to represent + * a possible ploidy-dimensional matrix at the leafs. The memory + * cost of the trie is O(n_alleles ^ ploidy) and allows the prefix + * lookup of any genotype. + * + * @param target + * @param depth + * @return + */ + bool AddGenotypeLayer(yon_gt_summary_obj* target, uint8_t depth); + + /**< + * + * @param gt + * @return + */ + yon_gt_summary& operator+=(const yon_gt& gt); + + // Accessors to internal data. 
+ inline uint64_t* GetAlleleCountsRaw(void){ return(this->alleles); } + inline const uint64_t* GetAlleleCountsRaw(void) const{ return(this->alleles); } + + std::vector GetAlleleCounts(void) const; + std::vector< std::pair > GetAlleleCountFrequency(void) const; + std::vector< std::vector > GetAlleleStrandCounts(void) const; + bool GetGenotype(std::vector& data, + yon_gt_summary_obj* target, + uint8_t depth) const; + + // Todo: unfinished. + std::vector GetGenotypeCounts(bool drop_empty = true) const; + std::vector GetStrandBiasAlleles(const bool phred_scale = true) const; + double CalculateHardyWeinberg(void) const; + + /**< + * + * @return + */ + bool LazyEvaluate(void); + + uint8_t n_ploidy; // base ploidy at site + uint8_t n_alleles; // number of alleles + uint64_t* alleles; // allele counts + uint64_t** alleles_strand; // allelic counts per chromosome + yon_gt_summary_obj* gt; // full genotypic trie with branch-size n_alleles + yon_gt_summary_rcd* d; // lazy evaluated record +}; + + +} + +#endif /* CORE_GENOTYPES_H_ */ diff --git a/lib/core/header/header_contig.h b/lib/core/header/header_contig.h deleted file mode 100644 index a10ba33..0000000 --- a/lib/core/header/header_contig.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef CORE_HEADERCONTIG_H_ -#define CORE_HEADERCONTIG_H_ - -#include -#include "io/basic_buffer.h" - -namespace tachyon{ -namespace core{ - -struct HeaderContig{ - typedef HeaderContig self_type; - -public: - HeaderContig() : contigID(0), bp_length(0), n_blocks(0){} - ~HeaderContig(){} - - inline void operator++(void){ ++this->n_blocks; } - inline void operator--(void){ --this->n_blocks; } - template inline void operator+=(const T value){ this->n_blocks += value; } - template inline void operator-=(const T value){ this->n_blocks -= value; } - -private: - friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ - const U32 l_name = entry.name.size(); - stream.write(reinterpret_cast(&l_name), sizeof(U32)); - 
stream.write(reinterpret_cast(&entry.contigID), sizeof(U32)); - stream.write(reinterpret_cast(&entry.bp_length), sizeof(U64)); - stream.write(reinterpret_cast(&entry.n_blocks), sizeof(U32)); - stream.write(&entry.name[0], entry.name.size()); - return(stream); - } - - friend std::ifstream& operator>>(std::ifstream& stream, self_type& entry){ - U32 l_name = 0; - stream.read(reinterpret_cast(&l_name), sizeof(U32)); - stream.read(reinterpret_cast(&entry.contigID), sizeof(U32)); - stream.read(reinterpret_cast(&entry.bp_length), sizeof(U64)); - stream.read(reinterpret_cast(&entry.n_blocks), sizeof(U32)); - entry.name.resize(l_name); - stream.read(&entry.name[0], l_name); - - return(stream); - } - - friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const self_type& contig){ - buffer += (U32)contig.name.size(); - buffer += contig.contigID; - buffer += contig.bp_length; - buffer += contig.n_blocks; - buffer.Add(&contig.name[0], contig.name.size()); - return(buffer); - } - - friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, self_type& contig){ - U32 l_name; - buffer >> l_name; - buffer >> contig.contigID; - buffer >> contig.bp_length; - buffer >> contig.n_blocks; - contig.name.resize(l_name); - buffer.read(&contig.name[0], l_name); - return(buffer); - } - -public: - U32 contigID; - U64 bp_length; - U32 n_blocks; - std::string name; -}; - -} -} - -#endif /* CORE_HEADERCONTIG_H_ */ diff --git a/lib/core/header/header_magic.cpp b/lib/core/header/header_magic.cpp deleted file mode 100644 index 55a62fa..0000000 --- a/lib/core/header/header_magic.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "header_magic.h" - -namespace tachyon{ -namespace core{ - -HeaderMagic::HeaderMagic() : - major_version(tachyon::constants::TACHYON_VERSION_MAJOR), - minor_version(tachyon::constants::TACHYON_VERSION_MINOR), - patch_version(tachyon::constants::TACHYON_VERSION_PATCH), - controller(0), - n_samples(0), - n_contigs(0), - n_info_values(0), - n_format_values(0), - 
n_filter_values(0), - l_literals(0), - l_header(0), - l_header_uncompressed(0) -{ - //memcpy(&this->magic_string[0], - // &tachyon::constants::FILE_HEADER[0], - // tachyon::constants::FILE_HEADER_LENGTH); -} - -HeaderMagic::HeaderMagic(const self_type& other) : - major_version(other.major_version), - minor_version(other.minor_version), - patch_version(other.patch_version), - controller(other.controller), - n_samples(other.n_samples), - n_contigs(other.n_contigs), - n_info_values(other.n_info_values), - n_format_values(other.n_format_values), - n_filter_values(other.n_filter_values), - l_literals(other.l_literals), - l_header(other.l_header), - l_header_uncompressed(other.l_header_uncompressed) -{ - //memcpy(&this->magic_string[0], - // &other.magic_string[0], - // tachyon::constants::FILE_HEADER_LENGTH); -} - -} -} diff --git a/lib/core/header/header_magic.h b/lib/core/header/header_magic.h deleted file mode 100644 index fe61bae..0000000 --- a/lib/core/header/header_magic.h +++ /dev/null @@ -1,129 +0,0 @@ -#ifndef CORE_HEADER_HEADER_MAGIC_H_ -#define CORE_HEADER_HEADER_MAGIC_H_ - -#include -#include -#include - -#include "io/basic_buffer.h" -#include "support/magic_constants.h" - -namespace tachyon{ -namespace core{ - -struct HeaderMagic{ -public: - typedef HeaderMagic self_type; - -public: - HeaderMagic(); - HeaderMagic(const self_type& other); - ~HeaderMagic() = default; - - inline const U64& getNumberSamples(void) const{ return(this->n_samples); } - inline U64& getNumberSamples(void){ return(this->n_samples); } - inline const U32& getNumberContigs(void) const{ return(this->n_contigs); } - inline U32& getNumberContigs(void){ return(this->n_contigs); } - inline const U16& getController(void) const{ return(this->controller); } - inline U16& getController(void){ return(this->controller); } - - inline bool validate(void) const{ - return(this->n_contigs > 0 && (this->major_version > 0 || this->minor_version > 0)); - } - - inline const bool operator!=(const self_type& 
other) const{ return(!(*this == other)); } - inline const bool operator==(const self_type& other) const{ - if(this->n_samples != other.n_samples) return false; - if(this->n_contigs != other.n_contigs) return false; - return true; - } - -private: - friend std::ostream& operator<<(std::ostream& stream, const self_type& header){ - //stream.write(header.magic_string, tachyon::constants::FILE_HEADER_LENGTH); - stream.write(reinterpret_cast(&tachyon::constants::TACHYON_VERSION_MAJOR), sizeof(S32)); - stream.write(reinterpret_cast(&tachyon::constants::TACHYON_VERSION_MINOR), sizeof(S32)); - stream.write(reinterpret_cast(&tachyon::constants::TACHYON_VERSION_PATCH), sizeof(S32)); - stream.write(reinterpret_cast(&header.controller), sizeof(U16)); - stream.write(reinterpret_cast(&header.n_samples), sizeof(U64)); - stream.write(reinterpret_cast(&header.n_contigs), sizeof(U32)); - stream.write(reinterpret_cast(&header.n_info_values), sizeof(U32)); - stream.write(reinterpret_cast(&header.n_format_values), sizeof(U32)); - stream.write(reinterpret_cast(&header.n_filter_values), sizeof(U32)); - stream.write(reinterpret_cast(&header.l_literals), sizeof(U32)); - stream.write(reinterpret_cast(&header.l_header), sizeof(U32)); - stream.write(reinterpret_cast(&header.l_header_uncompressed), sizeof(U32)); - return stream; - } - - friend std::istream& operator>>(std::istream& stream, self_type& header){ - //stream.read(header.magic_string, tachyon::constants::FILE_HEADER_LENGTH); - stream.read(reinterpret_cast(&header.major_version), sizeof(S32)); - stream.read(reinterpret_cast(&header.minor_version), sizeof(S32)); - stream.read(reinterpret_cast(&header.patch_version), sizeof(S32)); - stream.read(reinterpret_cast(&header.controller), sizeof(U16)); - stream.read(reinterpret_cast(&header.n_samples), sizeof(U64)); - stream.read(reinterpret_cast(&header.n_contigs), sizeof(U32)); - stream.read(reinterpret_cast(&header.n_info_values), sizeof(U32)); - 
stream.read(reinterpret_cast(&header.n_format_values), sizeof(U32)); - stream.read(reinterpret_cast(&header.n_filter_values), sizeof(U32)); - stream.read(reinterpret_cast(&header.l_literals), sizeof(U32)); - stream.read(reinterpret_cast(&header.l_header), sizeof(U32)); - stream.read(reinterpret_cast(&header.l_header_uncompressed), sizeof(U32)); - return(stream); - } - - friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const self_type& header){ - //buffer.Add(header.magic_string, tachyon::constants::FILE_HEADER_LENGTH); - buffer += header.major_version; - buffer += header.minor_version; - buffer += header.patch_version; - buffer += header.controller; - buffer += header.n_samples; - buffer += header.n_contigs; - buffer += header.n_info_values; - buffer += header.n_format_values; - buffer += header.n_filter_values; - buffer += header.l_literals; - buffer += header.l_header; - buffer += header.l_header_uncompressed; - return(buffer); - } - - friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, self_type& header){ - //buffer.read(header.magic_string, tachyon::constants::FILE_HEADER_LENGTH); - buffer >> header.major_version; - buffer >> header.minor_version; - buffer >> header.patch_version; - buffer >> header.controller; - buffer >> header.n_samples; - buffer >> header.n_contigs; - buffer >> header.n_info_values; - buffer >> header.n_format_values; - buffer >> header.n_filter_values; - buffer >> header.l_literals; - buffer >> header.l_header; - buffer >> header.l_header_uncompressed; - return(buffer); - } - -public: - //char magic_string[tachyon::constants::FILE_HEADER_LENGTH]; - S32 major_version; - S32 minor_version; - S32 patch_version; - U16 controller; // controller - U64 n_samples; // number of samples - U32 n_contigs; // number of contigs - U32 n_info_values; // number of unique info fields - U32 n_format_values; // number of unique format fields - U32 n_filter_values; // number of unique filter fields - U32 l_literals; // literals length - U32 
l_header; // compressed length - U32 l_header_uncompressed; // uncompressed length -}; - -} -} - -#endif /* CORE_HEADER_HEADER_MAGIC_H_ */ diff --git a/lib/core/header/header_map_entry.cpp b/lib/core/header/header_map_entry.cpp deleted file mode 100644 index 20f9327..0000000 --- a/lib/core/header/header_map_entry.cpp +++ /dev/null @@ -1,56 +0,0 @@ -#include "header_map_entry.h" - -namespace tachyon{ -namespace core{ - -HeaderMapEntry::HeaderMapEntry() : - IDX(0), - primitive_type(0) -{} - -HeaderMapEntry::HeaderMapEntry(const std::string& id, const S32& idx) : - IDX(idx), - primitive_type(0), - ID(id) -{} - -HeaderMapEntry::HeaderMapEntry(const std::string& id, const S32& idx, const S32& primitive_type) : - IDX(idx), - primitive_type(primitive_type), - ID(id) -{} - -HeaderMapEntry::HeaderMapEntry(const std::string& id) : - IDX(0), - primitive_type(0), - ID(id) -{} - -HeaderMapEntry::HeaderMapEntry(const self_type& other) : - IDX(other.IDX), - primitive_type(other.primitive_type), - ID(other.ID) -{} - -HeaderMapEntry::HeaderMapEntry(self_type&& other) : - IDX(other.IDX), - primitive_type(other.primitive_type), - ID(other.ID) -{} - -HeaderMapEntry& HeaderMapEntry::operator=(const self_type& other){ - this->IDX = other.IDX; - this->primitive_type = other.primitive_type; - this->ID = other.ID; - return(*this); -} - -HeaderMapEntry& HeaderMapEntry::operator=(HeaderMapEntry&& other){ - this->IDX = other.IDX; - this->primitive_type = other.primitive_type; - this->ID = other.ID; - return(*this); -} - -} -} diff --git a/lib/core/header/header_map_entry.h b/lib/core/header/header_map_entry.h deleted file mode 100644 index d543f20..0000000 --- a/lib/core/header/header_map_entry.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef CORE_BASE_HEADERMAPENTRY_H_ -#define CORE_BASE_HEADERMAPENTRY_H_ - -#include - -#include "support/enums.h" -#include "io/basic_buffer.h" -#include "io/vcf/VCFHeaderLine.h" - -namespace tachyon{ -namespace core{ - - -/**< - * FORMAT/FILTER/INFO field entry - */ 
-struct HeaderMapEntry{ -private: - typedef HeaderMapEntry self_type; - -public: - HeaderMapEntry(); - HeaderMapEntry(const std::string& id, const S32& idx); - HeaderMapEntry(const std::string& id, const S32& idx, const S32& primitive_type); - HeaderMapEntry(const std::string& id); - HeaderMapEntry(const self_type& other); - HeaderMapEntry(self_type&& other); - HeaderMapEntry& operator=(const self_type& other); - HeaderMapEntry& operator=(HeaderMapEntry&& other); - ~HeaderMapEntry() = default; - - inline const bool operator<(const self_type& other) const{ return(this->IDX < other.IDX); } - inline const bool operator>(const self_type& other) const{ return(!this->operator<(other));} - inline const TACHYON_VARIANT_HEADER_FIELD_TYPE getType(void) const{ return(TACHYON_VARIANT_HEADER_FIELD_TYPE(this->primitive_type)); } - -private: - friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ - const U32 l_ID = entry.ID.size(); - stream.write(reinterpret_cast(&l_ID), sizeof(U32)); - stream.write(reinterpret_cast(&entry.IDX), sizeof(S32)); - stream.write(reinterpret_cast(&entry.primitive_type), sizeof(BYTE)); - stream.write(&entry.ID[0], entry.ID.size()); - return(stream); - } - - friend std::ifstream& operator>>(std::ifstream& stream, self_type& entry){ - U32 l_ID = 0; - stream.read(reinterpret_cast(&l_ID), sizeof(U32)); - stream.read(reinterpret_cast(&entry.IDX), sizeof(S32)); - stream.read(reinterpret_cast(&entry.primitive_type), sizeof(BYTE)); - entry.ID.resize(l_ID); - stream.read(&entry.ID[0], l_ID); - - return(stream); - } - - friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const self_type& entry){ - buffer += (U32)entry.ID.size(); - buffer += entry.IDX; - buffer += entry.primitive_type; - buffer.Add(&entry.ID[0], entry.ID.size()); - return(buffer); - } - - friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, self_type& entry){ - U32 l_ID = 0; - buffer >> l_ID; - buffer >> entry.IDX; - buffer >> entry.primitive_type; - 
entry.ID.resize(l_ID); - buffer.read(&entry.ID[0], l_ID); - return(buffer); - } - -public: - S32 IDX; - BYTE primitive_type; - std::string ID; -}; - -} -} - -#endif /* CORE_BASE_HEADERMAPENTRY_H_ */ diff --git a/lib/core/header/header_sample.h b/lib/core/header/header_sample.h deleted file mode 100644 index c0eaf7e..0000000 --- a/lib/core/header/header_sample.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef CORE_BASE_HEADERSAMPLE_H_ -#define CORE_BASE_HEADERSAMPLE_H_ - -#include "io/basic_buffer.h" - -namespace tachyon{ -namespace core{ - -struct HeaderSample{ -private: - typedef HeaderSample self_type; - -public: - HeaderSample(void){} - HeaderSample(const std::string& name) : name(name){} - ~HeaderSample(){} - - // Capacity - inline const bool empty(void) const{ return(this->name.size() == 0); } - inline const size_t size(void) const{ return(this->name.size()); } - - // Element access - inline char* data(void){ return(&this->name[0]); } - inline const char* const data(void) const{ return(&this->name[0]); } - -private: - friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ - const U32 l_name = entry.name.size(); - stream.write(reinterpret_cast(&l_name), sizeof(U32)); - stream.write(&entry.name[0], entry.name.size()); - return(stream); - } - - friend std::ifstream& operator>>(std::ifstream& stream, self_type& entry){ - U32 l_name = 0; - stream.read(reinterpret_cast(&l_name), sizeof(U32)); - entry.name.resize(l_name); - stream.read(&entry.name[0], l_name); - return(stream); - } - - friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const self_type& sample){ - buffer += (U32)sample.name.size(); - buffer.Add(&sample.name[0], sample.name.size()); - return(buffer); - } - - friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, self_type& sample){ - U32 l_name = 0; - buffer >> l_name; - sample.name.resize(l_name); - buffer.read(&sample.name[0], l_name); - return(buffer); - } - -public: - std::string name; -}; - -} -} - - - -#endif /* 
CORE_BASE_HEADERSAMPLE_H_ */ diff --git a/lib/core/header/variant_header.cpp b/lib/core/header/variant_header.cpp index 4106ceb..4e79ac6 100644 --- a/lib/core/header/variant_header.cpp +++ b/lib/core/header/variant_header.cpp @@ -1,303 +1,387 @@ #include "variant_header.h" namespace tachyon{ -namespace core{ - -VariantHeader::VariantHeader(void) : - contigs(nullptr), - samples(nullptr), - info_fields(nullptr), - format_fields(nullptr), - filter_fields(nullptr), - htable_contigs(nullptr), - htable_samples(nullptr), - htable_info_fields(nullptr), - htable_format_fields(nullptr), - htable_filter_fields(nullptr) -{} - -VariantHeader::VariantHeader(const vcf_header_type& vcf_header) : - contigs(nullptr), - samples(nullptr), - info_fields(nullptr), - format_fields(nullptr), - filter_fields(nullptr), - htable_contigs(nullptr), - htable_samples(nullptr), - htable_info_fields(nullptr), - htable_format_fields(nullptr), - htable_filter_fields(nullptr) -{ - // Invoke copy operator - *this = vcf_header; -} -VariantHeader::VariantHeader(const self_type& other) : - header_magic(other.header_magic), - contigs(new contig_type[this->header_magic.getNumberContigs()]), - samples(new sample_type[this->header_magic.getNumberSamples()]), - info_fields(new map_entry_type[this->header_magic.n_info_values]), - format_fields(new map_entry_type[this->header_magic.n_format_values]), - filter_fields(new map_entry_type[this->header_magic.n_filter_values]), - htable_contigs(nullptr), - htable_samples(nullptr), - htable_info_fields(nullptr), - htable_format_fields(nullptr), - htable_filter_fields(nullptr) -{ - for(U32 i = 0; i < this->header_magic.getNumberContigs(); ++i) - this->contigs[i] = other.contigs[i]; - - for(U32 i = 0; i < this->header_magic.getNumberSamples(); ++i) - this->samples[i] = other.samples[i]; - - for(U32 i = 0; i < this->header_magic.n_info_values; ++i){ - this->info_fields[i] = other.info_fields[i]; - } +bool VariantHeader::BuildReverseMaps(void){ + 
this->contigs_reverse_map_.clear(); + this->info_fields_reverse_map_.clear(); + this->format_fields_reverse_map_.clear(); + this->filter_fields_reverse_map_.clear(); - for(U32 i = 0; i < this->header_magic.n_format_values; ++i){ - this->format_fields[i] = other.format_fields[i]; - } + for(uint32_t i = 0; i < this->contigs_.size(); ++i) this->contigs_reverse_map_[this->contigs_[i].idx] = i; + for(uint32_t i = 0; i < this->info_fields_.size(); ++i) this->info_fields_reverse_map_[this->info_fields_[i].idx] = i; + for(uint32_t i = 0; i < this->format_fields_.size(); ++i) this->format_fields_reverse_map_[this->format_fields_[i].idx] = i; + for(uint32_t i = 0; i < this->filter_fields_.size(); ++i) this->filter_fields_reverse_map_[this->filter_fields_[i].idx] = i; - for(U32 i = 0; i < this->header_magic.n_filter_values; ++i){ - this->filter_fields[i] = other.filter_fields[i]; - } + return true; +} + +bool VariantHeader::BuildMaps(void){ + this->info_fields_map_.clear(); + this->format_fields_map_.clear(); + this->filter_fields_map_.clear(); + this->contigs_map_.clear(); + + for(uint32_t i = 0; i < this->contigs_.size(); ++i) this->contigs_map_[this->contigs_[i].name] = i; + for(uint32_t i = 0; i < this->info_fields_.size(); ++i) this->info_fields_map_[this->info_fields_[i].id] = i; + for(uint32_t i = 0; i < this->format_fields_.size(); ++i) this->format_fields_map_[this->format_fields_[i].id] = i; + for(uint32_t i = 0; i < this->filter_fields_.size(); ++i) this->filter_fields_map_[this->filter_fields_[i].id] = i; + for(uint32_t i = 0; i < this->samples_.size(); ++i) this->samples_map_[this->samples_[i]] = i; - this->buildHashTables(); + return true; } -VariantHeader::~VariantHeader(){ - delete [] this->contigs; - delete [] this->samples; - delete [] this->info_fields; - delete [] this->format_fields; - delete [] this->filter_fields; - delete this->htable_contigs; - delete this->htable_samples; - delete this->htable_info_fields; - delete this->htable_format_fields; - 
delete this->htable_filter_fields; +bool VariantHeader::RecodeIndices(void){ + for(uint32_t i = 0; i < this->contigs_.size(); ++i) this->contigs_[i].idx = i; + for(uint32_t i = 0; i < this->info_fields_.size(); ++i) this->info_fields_[i].idx = i; + for(uint32_t i = 0; i < this->format_fields_.size(); ++i) this->format_fields_[i].idx = i; + for(uint32_t i = 0; i < this->filter_fields_.size(); ++i) this->filter_fields_[i].idx = i; + + if(this->BuildMaps() == false) return false; + if(this->BuildReverseMaps() == false) return false; + return true; } -const bool VariantHeader::getContig(const std::string& p, contig_type*& target) const{ - if(this->htable_contigs == nullptr) return false; - S32* ret = nullptr; - if(this->htable_contigs->GetItem(&p[0], &p, ret, p.size())){ - target = &this->contigs[*ret]; - return true; +bcf_hdr_t* VariantHeader::ConvertVcfHeaderLiterals(const bool add_format){ + std::string internal = this->literals_; + internal += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; + if(this->samples_.size() && add_format){ + internal += "\tFORMAT\t"; + internal += this->samples_[0]; + for(size_t i = 1; i < this->samples_.size(); ++i) + internal += "\t" + this->samples_[i]; + } + internal += "\n"; + + hts_vcf_header* hdr = bcf_hdr_init("r"); + int ret = bcf_hdr_parse(hdr, (char*)internal.c_str()); + if(ret != 0){ + std::cerr << "failed to get bcf header from literals" << std::endl; + bcf_hdr_destroy(hdr); + return(nullptr); } - return false; + + return(hdr); } -const bool VariantHeader::getSample(const std::string& p, sample_type*& target) const{ - if(this->htable_samples == nullptr) return false; - S32* ret = nullptr; - if(this->htable_samples->GetItem(&p[0], &p, ret, p.size())){ - target = &this->samples[*ret]; - return true; +bcf_hdr_t* VariantHeader::ConvertVcfHeader(const bool add_format){ + std::string internal = this->ToString(true); + internal += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; + if(this->samples_.size() && add_format){ + internal 
+= "\tFORMAT\t"; + internal += this->samples_[0]; + for(size_t i = 1; i < this->samples_.size(); ++i) + internal += "\t" + this->samples_[i]; + } + internal += "\n"; + + hts_vcf_header* hdr = bcf_hdr_init("r"); + int ret = bcf_hdr_parse(hdr, (char*)internal.c_str()); + if(ret != 0){ + std::cerr << "failed to get bcf header from literals" << std::endl; + bcf_hdr_destroy(hdr); + return(nullptr); } - return false; + + return(hdr); } -const bool VariantHeader::getInfoField(const std::string& p, map_entry_type*& target) const{ - if(this->htable_info_fields== nullptr) return false; - S32* ret = nullptr; - if(this->htable_info_fields->GetItem(&p[0], &p, ret, p.size())){ - target = &this->info_fields[*ret]; - return true; +void VariantHeader::AddGenotypeAnnotationFields(void){ + //"NM","NPM","AN","HWE_P","AC","AF","AC_P","FS_A","F_PIC","HET","MULTI_ALLELIC" + + const YonInfo* info = this->GetInfo("NM"); + if(info == nullptr){ + YonInfo nm; + nm.id = "NM"; + nm.number = "1"; + nm.type = "Integer"; + nm.yon_type = YON_VCF_HEADER_INTEGER; + nm.description = "NM"; + nm.idx = this->info_fields_.size(); + this->literals_ += nm.ToVcfString(false) + "\n"; + this->info_fields_.push_back(nm); } - return false; -} -const bool VariantHeader::getFormatField(const std::string& p, map_entry_type*& target) const{ - if(this->htable_format_fields == nullptr) return false; - S32* ret = nullptr; - if(this->htable_format_fields->GetItem(&p[0], &p, ret, p.size())){ - target = &this->format_fields[*ret]; - return true; + info = this->GetInfo("NPM"); + if(info == nullptr){ + YonInfo npm; + npm.id = "NPM"; + npm.number = "1"; + npm.type = "Integer"; + npm.yon_type = YON_VCF_HEADER_INTEGER; + npm.description = "NPM"; + npm.idx = this->info_fields_.size(); + this->literals_ += npm.ToVcfString(false) + "\n"; + this->info_fields_.push_back(npm); } - return false; -} -const bool VariantHeader::getFilterField(const std::string& p, map_entry_type*& target) const{ - if(this->htable_filter_fields == 
nullptr) return false; - S32* ret = nullptr; - if(this->htable_filter_fields->GetItem(&p[0], &p, ret, p.size())){ - target = &this->filter_fields[*ret]; - return true; + info = this->GetInfo("AN"); + if(info == nullptr){ + YonInfo npm; + npm.id = "AN"; + npm.number = "1"; + npm.type = "Integer"; + npm.yon_type = YON_VCF_HEADER_INTEGER; + npm.description = "AN"; + npm.idx = this->info_fields_.size(); + this->literals_ += npm.ToVcfString(false) + "\n"; + this->info_fields_.push_back(npm); } - return false; -} -const core::HeaderMapEntry* VariantHeader::getInfoField(const std::string& p) const{ - if(this->htable_info_fields == nullptr) return nullptr; - S32* ret = nullptr; - if(this->htable_info_fields->GetItem(&p[0], &p, ret, p.size())) - return(&this->info_fields[*ret]); + info = this->GetInfo("HWE_P"); + if(info == nullptr){ + YonInfo npm; + npm.id = "HWE_P"; + npm.number = "1"; + npm.type = "Float"; + npm.yon_type = YON_VCF_HEADER_FLOAT; + npm.description = "HWE_P"; + npm.idx = this->info_fields_.size(); + this->literals_ += npm.ToVcfString(false) + "\n"; + this->info_fields_.push_back(npm); + } - return nullptr; -} + info = this->GetInfo("AC"); + if(info == nullptr){ + YonInfo npm; + npm.id = "AC"; + npm.number = "."; + npm.type = "Integer"; + npm.yon_type = YON_VCF_HEADER_INTEGER; + npm.description = "AC"; + npm.idx = this->info_fields_.size(); + this->literals_ += npm.ToVcfString(false) + "\n"; + this->info_fields_.push_back(npm); + } -const core::HeaderMapEntry* VariantHeader::getFormatField(const std::string& p) const{ - if(this->htable_format_fields == nullptr) return nullptr; - S32* ret = nullptr; - if(this->htable_format_fields->GetItem(&p[0], &p, ret, p.size())) - return(&this->format_fields[*ret]); + info = this->GetInfo("AF"); + if(info == nullptr){ + YonInfo npm; + npm.id = "AF"; + npm.number = "."; + npm.type = "Float"; + npm.yon_type = YON_VCF_HEADER_FLOAT; + npm.description = "AF"; + npm.idx = this->info_fields_.size(); + this->literals_ += 
npm.ToVcfString(false) + "\n"; + this->info_fields_.push_back(npm); + } - return nullptr; -} + info = this->GetInfo("AC_P"); + if(info == nullptr){ + YonInfo npm; + npm.id = "AC_P"; + npm.number = "."; + npm.type = "Integer"; + npm.yon_type = YON_VCF_HEADER_INTEGER; + npm.description = "AC_P"; + npm.idx = this->info_fields_.size(); + this->literals_ += npm.ToVcfString(false) + "\n"; + this->info_fields_.push_back(npm); + } -const core::HeaderMapEntry* VariantHeader::getFilterField(const std::string& p) const{ - if(this->htable_filter_fields == nullptr) return nullptr; - S32* ret = nullptr; - if(this->htable_filter_fields->GetItem(&p[0], &p, ret, p.size())) - return(&this->filter_fields[*ret]); + info = this->GetInfo("FS_A"); + if(info == nullptr){ + YonInfo npm; + npm.id = "FS_A"; + npm.number = "."; + npm.type = "Float"; + npm.yon_type = YON_VCF_HEADER_FLOAT; + npm.description = "FS_A"; + npm.idx = this->info_fields_.size(); + this->literals_ += npm.ToVcfString(false) + "\n"; + this->info_fields_.push_back(npm); + } - return nullptr; -} + info = this->GetInfo("F_PIC"); + if(info == nullptr){ + YonInfo nm; + nm.id = "F_PIC"; + nm.number = "1"; + nm.type = "Float"; + nm.yon_type = YON_VCF_HEADER_FLOAT; + nm.description = "F_PIC"; + nm.idx = this->info_fields_.size(); + this->literals_ += nm.ToVcfString(false) + "\n"; + this->info_fields_.push_back(nm); + } -bool VariantHeader::buildHashTables(void){ - if(this->header_magic.n_contigs){ - if(this->header_magic.n_contigs*2 < 5012){ - this->htable_contigs = new hash_table_type(5012); - } else - this->htable_contigs = new hash_table_type(this->header_magic.n_contigs*2); + info = this->GetInfo("HET"); + if(info == nullptr){ + YonInfo nm; + nm.id = "HET"; + nm.number = "1"; + nm.type = "Float"; + nm.yon_type = YON_VCF_HEADER_FLOAT; + nm.description = "HET"; + nm.idx = this->info_fields_.size(); + this->literals_ += nm.ToVcfString(false) + "\n"; + this->info_fields_.push_back(nm); + } - for(S32 i = 0; i < 
this->header_magic.n_contigs; ++i){ - this->htable_contigs->SetItem(&this->contigs[i].name[0], &this->contigs[i].name, i, this->contigs[i].name.size()); - } + info = this->GetInfo("MULTI_ALLELIC"); + if(info == nullptr){ + YonInfo nm; + nm.id = "MULTI_ALLELIC"; + nm.number = "1"; + nm.type = "Flag"; + nm.yon_type = YON_VCF_HEADER_FLAG; + nm.description = "MULTI_ALLELIC"; + nm.idx = this->info_fields_.size(); + this->literals_ += nm.ToVcfString(false) + "\n"; + this->info_fields_.push_back(nm); } - if(this->header_magic.n_samples){ - if(this->header_magic.n_samples*2 < 5012){ - this->htable_samples = new hash_table_type(5012); - } else - this->htable_samples = new hash_table_type(this->header_magic.n_samples*2); + this->BuildMaps(); + this->BuildReverseMaps(); +} - for(S32 i = 0; i < this->header_magic.n_samples; ++i){ - this->htable_samples->SetItem(&this->samples[i].name[0], &this->samples[i].name, i, this->samples[i].name.size()); - } - } +std::ostream& operator<<(std::ostream& stream, const VariantHeader& header){ + utility::SerializeString(header.fileformat_string_, stream); + utility::SerializeString(header.literals_, stream); - if(this->header_magic.n_info_values){ - if(this->header_magic.n_info_values*2 < 5012){ - this->htable_info_fields = new hash_table_type(5012); - } else - this->htable_info_fields = new hash_table_type(this->header_magic.n_info_values*2); + size_t l_helper = header.samples_.size(); + utility::SerializePrimitive(l_helper, stream); + for(U32 i = 0; i < header.samples_.size(); ++i) utility::SerializeString(header.samples_[i], stream); - for(S32 i = 0; i < this->header_magic.n_info_values; ++i){ - this->htable_info_fields->SetItem(&this->info_fields[i].ID[0], &this->info_fields[i].ID, i, this->info_fields[i].ID.size()); - } - } + l_helper = header.contigs_.size(); + utility::SerializePrimitive(l_helper, stream); + for(U32 i = 0; i < header.contigs_.size(); ++i) stream << header.contigs_[i]; - if(this->header_magic.n_format_values){ - 
if(this->header_magic.n_format_values*2 < 5012){ - this->htable_format_fields = new hash_table_type(5012); - } else - this->htable_format_fields = new hash_table_type(this->header_magic.n_format_values*2); + l_helper = header.info_fields_.size(); + utility::SerializePrimitive(l_helper, stream); + for(U32 i = 0; i < header.info_fields_.size(); ++i) stream << header.info_fields_[i]; - for(S32 i = 0; i < this->header_magic.n_format_values; ++i){ - this->htable_format_fields->SetItem(&this->format_fields[i].ID[0], &this->format_fields[i].ID, i, this->format_fields[i].ID.size()); - } - } + l_helper = header.format_fields_.size(); + utility::SerializePrimitive(l_helper, stream); + for(U32 i = 0; i < header.format_fields_.size(); ++i) stream << header.format_fields_[i]; - if(this->header_magic.n_filter_values){ - if(this->header_magic.n_filter_values*2 < 5012){ - this->htable_filter_fields = new hash_table_type(5012); - } else - this->htable_filter_fields = new hash_table_type(this->header_magic.n_filter_values*2); + l_helper = header.filter_fields_.size(); + utility::SerializePrimitive(l_helper, stream); + for(U32 i = 0; i < header.filter_fields_.size(); ++i) stream << header.filter_fields_[i]; - for(S32 i = 0; i < this->header_magic.n_filter_values; ++i){ - this->htable_filter_fields->SetItem(&this->filter_fields[i].ID[0], &this->filter_fields[i].ID, i, this->filter_fields[i].ID.size()); - } - } + l_helper = header.structured_extra_fields_.size(); + utility::SerializePrimitive(l_helper, stream); + for(U32 i = 0; i < header.structured_extra_fields_.size(); ++i) stream << header.structured_extra_fields_[i]; - return true; + l_helper = header.extra_fields_.size(); + utility::SerializePrimitive(l_helper, stream); + for(U32 i = 0; i < header.extra_fields_.size(); ++i) stream << header.extra_fields_[i]; + + return(stream); } -void VariantHeader::operator=(const vcf_header_type& vcf_header){ - this->header_magic.n_contigs = vcf_header.contigs.size(); - 
this->header_magic.n_samples = vcf_header.sampleNames.size(); - this->header_magic.n_info_values = vcf_header.info_map.size(); - this->header_magic.n_format_values = vcf_header.format_map.size(); - this->header_magic.n_filter_values = vcf_header.filter_map.size(); - - if(vcf_header.literal_lines.size()){ - this->literals += vcf_header.literal_lines[0]; - for(U32 i = 1; i < vcf_header.literal_lines.size(); ++i) - this->literals += "\n" + vcf_header.literal_lines[i]; - } +std::istream& operator>>(std::istream& stream, VariantHeader& header){ + utility::DeserializeString(header.fileformat_string_, stream); + utility::DeserializeString(header.literals_, stream); - this->header_magic.l_literals = this->literals.size(); + size_t l_helper; + utility::DeserializePrimitive(l_helper, stream); + header.samples_.resize(l_helper); + for(U32 i = 0; i < header.samples_.size(); ++i) utility::DeserializeString(header.samples_[i], stream); - // Cleanup previous - delete [] this->contigs; - delete [] this->samples; - delete [] this->info_fields; - delete [] this->filter_fields; - delete [] this->format_fields; + utility::DeserializePrimitive(l_helper, stream); + header.contigs_.resize(l_helper); + for(U32 i = 0; i < header.contigs_.size(); ++i) stream >> header.contigs_[i]; - this->contigs = new contig_type[this->header_magic.getNumberContigs()]; - for(U32 i = 0; i < this->header_magic.getNumberContigs(); ++i){ - this->contigs[i] = vcf_header.contigs[i]; - this->contigs[i].contigID = i; - } + utility::DeserializePrimitive(l_helper, stream); + header.info_fields_.resize(l_helper); + for(U32 i = 0; i < header.info_fields_.size(); ++i) stream >> header.info_fields_[i]; - this->samples = new sample_type[this->header_magic.getNumberSamples()]; - for(U32 i = 0; i < this->header_magic.getNumberSamples(); ++i) - this->samples[i] = vcf_header.sampleNames[i]; + utility::DeserializePrimitive(l_helper, stream); + header.format_fields_.resize(l_helper); + for(U32 i = 0; i < 
header.format_fields_.size(); ++i) stream >> header.format_fields_[i]; - this->info_fields = new map_entry_type[this->header_magic.n_info_values]; - for(U32 i = 0; i < this->header_magic.n_info_values; ++i){ - this->info_fields[i] = vcf_header.info_map[i]; - this->info_fields[i].IDX = i; // update idx to local - } + utility::DeserializePrimitive(l_helper, stream); + header.filter_fields_.resize(l_helper); + for(U32 i = 0; i < header.filter_fields_.size(); ++i) stream >> header.filter_fields_[i]; - this->format_fields = new map_entry_type[this->header_magic.n_format_values]; - for(U32 i = 0; i < this->header_magic.n_format_values; ++i){ - this->format_fields[i] = vcf_header.format_map[i]; - this->format_fields[i].IDX = i; // update idx to local - } + utility::DeserializePrimitive(l_helper, stream); + header.structured_extra_fields_.resize(l_helper); + for(U32 i = 0; i < header.structured_extra_fields_.size(); ++i) stream >> header.structured_extra_fields_[i]; - this->filter_fields = new map_entry_type[this->header_magic.n_filter_values]; - for(U32 i = 0; i < this->header_magic.n_filter_values; ++i){ - this->filter_fields[i] = vcf_header.filter_map[i]; - this->filter_fields[i].IDX = i; // update idx to local - } + utility::DeserializePrimitive(l_helper, stream); + header.extra_fields_.resize(l_helper); + for(U32 i = 0; i < header.extra_fields_.size(); ++i) stream >> header.extra_fields_[i]; - this->buildHashTables(); -} + header.BuildMaps(); + header.BuildReverseMaps(); -std::ostream& VariantHeader::writeHeaderVCF(std::ostream& stream, const bool showFormat) const{ - stream << this->literals; - if(this->literals.size()) stream.put('\n'); - if(showFormat){ - stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; - if(this->header_magic.n_samples){ - stream.put('\t'); - stream << this->samples[0].name; - for(U32 i = 1; i < this->header_magic.n_samples; ++i){ - stream << "\t" << this->samples[i].name; - } - } - } else { - stream << 
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; - } - stream.put('\n'); - return(stream); + return stream; } -std::ostream& VariantHeader::write(std::ostream& stream){ - stream << this->header_magic; - for(U32 i = 0; i < this->header_magic.n_contigs; ++i) stream << this->contigs[i]; - for(U32 i = 0; i < this->header_magic.n_samples; ++i) stream << this->samples[i]; - for(U32 i = 0; i < this->header_magic.n_info_values; ++i) stream << this->info_fields[i]; - for(U32 i = 0; i < this->header_magic.n_format_values; ++i) stream << this->format_fields[i]; - for(U32 i = 0; i < this->header_magic.n_filter_values; ++i) stream << this->filter_fields[i]; +io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const VariantHeader& header){ + io::SerializeString(header.fileformat_string_, buffer); + io::SerializeString(header.literals_, buffer); - stream.write(&this->literals[0], this->literals.size()); - return(stream); + uint32_t l_helper = header.samples_.size(); + io::SerializePrimitive(l_helper, buffer); + for(U32 i = 0; i < header.samples_.size(); ++i) io::SerializeString(header.samples_[i], buffer); + + l_helper = header.contigs_.size(); + io::SerializePrimitive(l_helper, buffer); + for(U32 i = 0; i < header.contigs_.size(); ++i) buffer << header.contigs_[i]; + + l_helper = header.info_fields_.size(); + io::SerializePrimitive(l_helper, buffer); + for(U32 i = 0; i < header.info_fields_.size(); ++i) buffer << header.info_fields_[i]; + + l_helper = header.format_fields_.size(); + io::SerializePrimitive(l_helper, buffer); + for(U32 i = 0; i < header.format_fields_.size(); ++i) buffer << header.format_fields_[i]; + + l_helper = header.filter_fields_.size(); + io::SerializePrimitive(l_helper, buffer); + for(U32 i = 0; i < header.filter_fields_.size(); ++i) buffer << header.filter_fields_[i]; + + l_helper = header.structured_extra_fields_.size(); + io::SerializePrimitive(l_helper, buffer); + for(U32 i = 0; i < header.structured_extra_fields_.size(); ++i) buffer << 
header.structured_extra_fields_[i]; + + l_helper = header.extra_fields_.size(); + io::SerializePrimitive(l_helper, buffer); + for(U32 i = 0; i < header.extra_fields_.size(); ++i) buffer << header.extra_fields_[i]; + + return(buffer); } +io::BasicBuffer& operator>>(io::BasicBuffer& buffer, VariantHeader& header){ + io::DeserializeString(header.fileformat_string_, buffer); + io::DeserializeString(header.literals_, buffer); + + uint32_t l_helper; + io::DeserializePrimitive(l_helper, buffer); + header.samples_.resize(l_helper); + for(U32 i = 0; i < header.samples_.size(); ++i) io::DeserializeString(header.samples_[i], buffer); + + io::DeserializePrimitive(l_helper, buffer); + header.contigs_.resize(l_helper); + for(U32 i = 0; i < header.contigs_.size(); ++i) buffer >> header.contigs_[i]; + + io::DeserializePrimitive(l_helper, buffer); + header.info_fields_.resize(l_helper); + for(U32 i = 0; i < header.info_fields_.size(); ++i) buffer >> header.info_fields_[i]; + + io::DeserializePrimitive(l_helper, buffer); + header.format_fields_.resize(l_helper); + for(U32 i = 0; i < header.format_fields_.size(); ++i) buffer >> header.format_fields_[i]; + + io::DeserializePrimitive(l_helper, buffer); + header.filter_fields_.resize(l_helper); + for(U32 i = 0; i < header.filter_fields_.size(); ++i) buffer >> header.filter_fields_[i]; + + io::DeserializePrimitive(l_helper, buffer); + header.structured_extra_fields_.resize(l_helper); + for(U32 i = 0; i < header.structured_extra_fields_.size(); ++i) buffer >> header.structured_extra_fields_[i]; + + io::DeserializePrimitive(l_helper, buffer); + header.extra_fields_.resize(l_helper); + for(U32 i = 0; i < header.extra_fields_.size(); ++i) buffer >> header.extra_fields_[i]; + + header.BuildMaps(); + header.BuildReverseMaps(); + + return buffer; } + } diff --git a/lib/core/header/variant_header.h b/lib/core/header/variant_header.h index fb08285..d5fa79e 100644 --- a/lib/core/header/variant_header.h +++ b/lib/core/header/variant_header.h @@ 
-1,183 +1,491 @@ #ifndef CORE_BASE_HEADER_YON_TACHYONHEADER_H_ #define CORE_BASE_HEADER_YON_TACHYONHEADER_H_ -#include "header_contig.h" -#include "header_magic.h" -#include "header_map_entry.h" -#include "header_sample.h" +#include "support/enums.h" #include "support/type_definitions.h" -#include "support/magic_constants.h" +#include "support/helpers.h" #include "algorithm/OpenHashTable.h" -#include "io/vcf/VCFHeader.h" #include "io/basic_buffer.h" +#include "io/vcf_utils.h" +#include + namespace tachyon{ -namespace core{ -/**< - * This class describes data that is mandatory in the Tachyon - * file-format - */ -class VariantHeader{ -private: - typedef VariantHeader self_type; - typedef HeaderMagic magic_type; - typedef core::HeaderContig contig_type; - typedef core::HeaderMapEntry map_entry_type; - typedef core::HeaderSample sample_type; - typedef hash::HashTable hash_table_type; - typedef vcf::VCFHeader vcf_header_type; +struct YonContig : public io::VcfContig { +public: + YonContig() : n_blocks(0){} + YonContig(const io::VcfContig& vcf_contig) : io::VcfContig(vcf_contig), n_blocks(0){} + ~YonContig() = default; + + std::string ToVcfString(const bool is_bcf = false) const{ return(io::VcfContig::ToVcfString(is_bcf)); } + std::string ToVcfString(const uint32_t idx) const{ return(io::VcfContig::ToVcfString(idx)); } + + inline void operator++(void){ ++this->n_blocks; } + inline void operator--(void){ --this->n_blocks; } + template inline void operator+=(const T value){ this->n_blocks += value; } + template inline void operator-=(const T value){ this->n_blocks -= value; } + + friend std::ostream& operator<<(std::ostream& stream, const YonContig& contig){ + utility::SerializePrimitive(contig.idx, stream); + utility::SerializePrimitive(contig.n_bases, stream); + utility::SerializePrimitive(contig.n_blocks, stream); + utility::SerializeString(contig.name, stream); + utility::SerializeString(contig.description, stream); + + size_t size_helper = contig.extra.size(); + 
utility::SerializePrimitive(size_helper, stream); + for(U32 i = 0; i < contig.extra.size(); ++i){ + utility::SerializeString(contig.extra[i].first, stream); + utility::SerializeString(contig.extra[i].second, stream); + } + return(stream); + } + + friend std::istream& operator>>(std::istream& stream, YonContig& contig){ + utility::DeserializePrimitive(contig.idx, stream); + utility::DeserializePrimitive(contig.n_bases, stream); + utility::DeserializePrimitive(contig.n_blocks, stream); + utility::DeserializeString(contig.name, stream); + utility::DeserializeString(contig.description, stream); + + size_t l_extra; + utility::DeserializePrimitive(l_extra, stream); + contig.extra.resize(l_extra); + for(U32 i = 0; i < contig.extra.size(); ++i){ + utility::DeserializeString(contig.extra[i].first, stream); + utility::DeserializeString(contig.extra[i].second, stream); + } + return(stream); + } + + friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const YonContig& contig){ + io::SerializePrimitive(contig.idx, buffer); + io::SerializePrimitive(contig.n_bases, buffer); + io::SerializePrimitive(contig.n_blocks, buffer); + io::SerializeString(contig.name, buffer); + io::SerializeString(contig.description, buffer); + + size_t size_helper = contig.extra.size(); + io::SerializePrimitive(size_helper, buffer); + for(U32 i = 0; i < contig.extra.size(); ++i){ + io::SerializeString(contig.extra[i].first, buffer); + io::SerializeString(contig.extra[i].second, buffer); + } + return(buffer); + } + + friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, YonContig& contig){ + io::DeserializePrimitive(contig.idx, buffer); + io::DeserializePrimitive(contig.n_bases, buffer); + io::DeserializePrimitive(contig.n_blocks, buffer); + io::DeserializeString(contig.name, buffer); + io::DeserializeString(contig.description, buffer); + + size_t l_extra; + io::DeserializePrimitive(l_extra, buffer); + contig.extra.resize(l_extra); + for(U32 i = 0; i < contig.extra.size(); ++i){ + 
io::DeserializeString(contig.extra[i].first, buffer); + io::DeserializeString(contig.extra[i].second, buffer); + } + return(buffer); + } public: - explicit VariantHeader(void); - VariantHeader(const vcf_header_type& vcf_header); - VariantHeader(const self_type& other); - ~VariantHeader(); - - inline const contig_type& getContig(const U32& position) const{ return(this->contigs[position]); } - inline const sample_type& getSample(const U32& position) const{ return(this->samples[position]); } - - const bool getContig(const std::string& p, contig_type*& target) const; - const bool getSample(const std::string& p, sample_type*& target) const; - const bool getInfoField(const std::string& p, map_entry_type*& target) const; - const bool getFormatField(const std::string& p, map_entry_type*& target) const; - const bool getFilterField(const std::string& p, map_entry_type*& target) const; - const map_entry_type* getInfoField(const std::string& p) const; - const map_entry_type* getFormatField(const std::string& p) const; - const map_entry_type* getFilterField(const std::string& p) const; - - inline const U64& getSampleNumber(void) const{ return(this->header_magic.n_samples); } - inline U64& getSampleNumber(void){ return(this->header_magic.n_samples); } - inline const U32& getContigNumber(void) const{ return(this->header_magic.n_contigs); } - inline U32& getContigNumber(void){ return(this->header_magic.n_contigs); } + // Number of Tachyon blocks associated with this contig + uint32_t n_blocks; +}; - /**< - * Interconvert a VCF header (provided during import) to - * a Tachyon header - * @param vcf_header Target input VCF header - */ - void operator=(const vcf_header_type& vcf_header); +class YonInfo : public io::VcfInfo { +public: + YonInfo() : yon_type(YON_VCF_HEADER_FLAG){} + YonInfo(const io::VcfInfo& vcf_info) : io::VcfInfo(vcf_info), yon_type(YON_VCF_HEADER_FLAG){ + this->EvaluateType(); + } + ~YonInfo() = default; + + std::string ToVcfString(const bool is_bcf = false) const{ 
return(io::VcfInfo::ToVcfString(is_bcf)); } + std::string ToVcfString(const uint32_t idx) const{ return(io::VcfInfo::ToVcfString(idx)); } + + bool EvaluateType(void){ + if(this->type == "Integer") this->yon_type = YON_VCF_HEADER_INTEGER; + else if(this->type == "Float") this->yon_type = YON_VCF_HEADER_FLOAT; + else if(this->type == "Flag") this->yon_type = YON_VCF_HEADER_FLAG; + else if(this->type == "Character") this->yon_type = YON_VCF_HEADER_CHARACTER; + else if(this->type == "String") this->yon_type = YON_VCF_HEADER_STRING; + else { + std::cerr << "Illegal header type: " << this->type << std::endl; + return false; + } + return true; + } - // write - std::ostream& write(std::ostream& stream); + friend std::ostream& operator<<(std::ostream& stream, const YonInfo& info){ + utility::SerializePrimitive(info.idx, stream); + utility::SerializeString(info.id, stream); + utility::SerializeString(info.number, stream); + utility::SerializeString(info.type, stream); + utility::SerializeString(info.description, stream); + utility::SerializeString(info.source, stream); + utility::SerializeString(info.version, stream); - const bool has_format_field(const std::string& field_name) const{ - map_entry_type* match = nullptr; - if(this->getFormatField(field_name, match)) - return true; + return(stream); + } + + friend std::istream& operator>>(std::istream& stream, YonInfo& info){ + utility::DeserializePrimitive(info.idx, stream); + utility::DeserializeString(info.id, stream); + utility::DeserializeString(info.number, stream); + utility::DeserializeString(info.type, stream); + utility::DeserializeString(info.description, stream); + utility::DeserializeString(info.source, stream); + utility::DeserializeString(info.version, stream); + info.EvaluateType(); + + return(stream); + } + + friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const YonInfo& info){ + io::SerializePrimitive(info.idx, buffer); + io::SerializeString(info.id, buffer); + io::SerializeString(info.number, 
buffer); + io::SerializeString(info.type, buffer); + io::SerializeString(info.description, buffer); + io::SerializeString(info.source, buffer); + io::SerializeString(info.version, buffer); + + return(buffer); + } + + friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, YonInfo& info){ + io::DeserializePrimitive(info.idx, buffer); + io::DeserializeString(info.id, buffer); + io::DeserializeString(info.number, buffer); + io::DeserializeString(info.type, buffer); + io::DeserializeString(info.description, buffer); + io::DeserializeString(info.source, buffer); + io::DeserializeString(info.version, buffer); + info.EvaluateType(); - return false; + return(buffer); } - const bool has_info_field(const std::string& field_name) const{ - map_entry_type* match = nullptr; - if(this->getInfoField(field_name, match)) - return true; +public: + TACHYON_VARIANT_HEADER_FIELD_TYPE yon_type; +}; - return false; +class YonFormat : public io::VcfFormat { +public: + YonFormat() : yon_type(YON_VCF_HEADER_FLAG){} + YonFormat(const io::VcfFormat& vcf_format) : io::VcfFormat(vcf_format), yon_type(YON_VCF_HEADER_FLAG){ + this->EvaluateType(); + } + ~YonFormat() = default; + + std::string ToVcfString(const bool is_bcf = false) const{ return(io::VcfFormat::ToVcfString(is_bcf)); } + std::string ToVcfString(const uint32_t idx) const{ return(io::VcfFormat::ToVcfString(idx)); } + + bool EvaluateType(void){ + if(this->type == "Integer") this->yon_type = YON_VCF_HEADER_INTEGER; + else if(this->type == "Float") this->yon_type = YON_VCF_HEADER_FLOAT; + else if(this->type == "Character") this->yon_type = YON_VCF_HEADER_CHARACTER; + else if(this->type == "String") this->yon_type = YON_VCF_HEADER_STRING; + else { + std::cerr << "Illegal header type: " << this->type << std::endl; + return false; + } + return true; } - const bool has_filter_field(const std::string& field_name) const{ - map_entry_type* match = nullptr; - if(this->getFilterField(field_name, match)) - return true; + friend std::ostream& 
operator<<(std::ostream& stream, const YonFormat& fmt){ + utility::SerializePrimitive(fmt.idx, stream); + utility::SerializeString(fmt.id, stream); + utility::SerializeString(fmt.number, stream); + utility::SerializeString(fmt.type, stream); + utility::SerializeString(fmt.description, stream); - return(false); + return(stream); } - std::ostream& writeHeaderVCF(std::ostream& stream, const bool showFormat = true) const; + friend std::istream& operator>>(std::istream& stream, YonFormat& fmt){ + utility::DeserializePrimitive(fmt.idx, stream); + utility::DeserializeString(fmt.id, stream); + utility::DeserializeString(fmt.number, stream); + utility::DeserializeString(fmt.type, stream); + utility::DeserializeString(fmt.description, stream); + fmt.EvaluateType(); -private: - bool buildHashTables(void); + return(stream); + } - friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const self_type& header){ - buffer << header.header_magic; - for(U32 i = 0; i < header.header_magic.n_contigs; ++i) buffer << header.contigs[i]; - for(U32 i = 0; i < header.header_magic.n_samples; ++i) buffer << header.samples[i]; - for(U32 i = 0; i < header.header_magic.n_info_values; ++i) buffer << header.info_fields[i]; - for(U32 i = 0; i < header.header_magic.n_format_values; ++i) buffer << header.format_fields[i]; - for(U32 i = 0; i < header.header_magic.n_filter_values; ++i) buffer << header.filter_fields[i]; + friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const YonFormat& fmt){ + io::SerializePrimitive(fmt.idx, buffer); + io::SerializeString(fmt.id, buffer); + io::SerializeString(fmt.number, buffer); + io::SerializeString(fmt.type, buffer); + io::SerializeString(fmt.description, buffer); - buffer.Add(&header.literals[0], header.literals.size()); return(buffer); } - friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, self_type& header){ - buffer >> header.header_magic; - delete [] header.contigs; - delete [] header.samples; - delete [] header.info_fields; - delete 
[] header.format_fields; - delete [] header.filter_fields; - header.contigs = new contig_type[header.header_magic.n_contigs]; - header.samples = new sample_type[header.header_magic.n_samples]; - header.info_fields = new map_entry_type[header.header_magic.n_info_values]; - header.format_fields = new map_entry_type[header.header_magic.n_format_values]; - header.filter_fields = new map_entry_type[header.header_magic.n_filter_values]; - for(U32 i = 0; i < header.header_magic.n_contigs; ++i) buffer >> header.contigs[i]; - for(U32 i = 0; i < header.header_magic.n_samples; ++i) buffer >> header.samples[i]; - for(U32 i = 0; i < header.header_magic.n_info_values; ++i) buffer >> header.info_fields[i]; - for(U32 i = 0; i < header.header_magic.n_format_values; ++i) buffer >> header.format_fields[i]; - for(U32 i = 0; i < header.header_magic.n_filter_values; ++i) buffer >> header.filter_fields[i]; - - header.literals.resize(header.header_magic.l_literals); - buffer.read(&header.literals[0], header.header_magic.l_literals); - - header.buildHashTables(); + friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, YonFormat& fmt){ + io::DeserializePrimitive(fmt.idx, buffer); + io::DeserializeString(fmt.id, buffer); + io::DeserializeString(fmt.number, buffer); + io::DeserializeString(fmt.type, buffer); + io::DeserializeString(fmt.description, buffer); + fmt.EvaluateType(); return(buffer); } - friend std::ifstream& operator>>(std::ifstream& stream, self_type& entry){ - stream >> entry.header_magic; +public: + TACHYON_VARIANT_HEADER_FIELD_TYPE yon_type; +}; - delete [] entry.contigs; - entry.contigs = new contig_type[entry.header_magic.n_contigs]; - for(U32 i = 0; i < entry.header_magic.n_contigs; ++i) stream >> entry.contigs[i]; +class VariantHeader{ +public: + typedef VariantHeader self_type; + typedef bcf_hdr_t hts_vcf_header; + typedef std::unordered_map map_type; + typedef std::unordered_map map_reverse_type; + +public: + VariantHeader() = default; + VariantHeader(const 
VariantHeader& other) : + fileformat_string_(other.fileformat_string_), + literals_(other.literals_), + samples_(other.samples_), + contigs_(other.contigs_), + info_fields_(other.info_fields_), + format_fields_(other.format_fields_), + filter_fields_(other.filter_fields_), + structured_extra_fields_(other.structured_extra_fields_), + extra_fields_(other.extra_fields_) + { + this->BuildMaps(); + this->BuildReverseMaps(); + } + + VariantHeader(const io::VcfHeader& other) : + fileformat_string_(other.fileformat_string_), + literals_(other.literals_), + samples_(other.samples_), + filter_fields_(other.filter_fields_), + structured_extra_fields_(other.structured_extra_fields_), + extra_fields_(other.extra_fields_) + { + this->BuildMaps(); + this->BuildReverseMaps(); + + this->contigs_.resize(other.contigs_.size()); + for(U32 i = 0; i < other.contigs_.size(); ++i) + this->contigs_[i] = other.contigs_[i]; + + this->info_fields_.resize(other.info_fields_.size()); + for(U32 i = 0; i < other.info_fields_.size(); ++i) + this->info_fields_[i] = other.info_fields_[i]; + + this->format_fields_.resize(other.format_fields_.size()); + for(U32 i = 0; i < other.format_fields_.size(); ++i) + this->format_fields_[i] = other.format_fields_[i]; + + this->RecodeIndices(); + + } + + ~VariantHeader() = default; + + inline size_t GetNumberSamples(void) const{ return(this->samples_.size()); } + inline size_t GetNumberContigs(void) const{ return(this->contigs_.size()); } + + void AddSample(const std::string& sample_name) { + if(this->samples_.size() == 0){ + this->samples_.push_back(sample_name); + this->samples_map_[sample_name] = 0; + return; + } + + if(this->samples_map_.find(sample_name) == this->samples_map_.end()){ + this->samples_map_[sample_name] = this->samples_.size(); + this->samples_.push_back(sample_name); + } else { + std::cerr << "illegal: duplicated sample name: " << sample_name << std::endl; + } + } + + const YonContig* GetContig(const std::string& name) const { + 
map_type::const_iterator it = this->contigs_map_.find(name); + if(it != this->contigs_map_.end()) return(&this->contigs_[it->second]); + return(nullptr); + } + + const YonContig* GetContig(const int& idx) const { + map_reverse_type::const_iterator it = this->contigs_reverse_map_.find(idx); + if(it != this->contigs_reverse_map_.end()) return(&this->contigs_[it->second]); + return(nullptr); + } + + const YonInfo* GetInfo(const std::string& name) const { + map_type::const_iterator it = this->info_fields_map_.find(name); + if(it != this->info_fields_map_.end()) return(&this->info_fields_[it->second]); + return(nullptr); + } + + const YonInfo* GetInfo(const int& idx) const { + map_reverse_type::const_iterator it = this->info_fields_reverse_map_.find(idx); + if(it != this->info_fields_reverse_map_.end()) return(&this->info_fields_[it->second]); + return(nullptr); + } - delete [] entry.samples; - entry.samples = new sample_type[entry.header_magic.n_samples]; - for(U32 i = 0; i < entry.header_magic.n_samples; ++i) stream >> entry.samples[i]; + const YonFormat* GetFormat(const std::string& name) const { + map_type::const_iterator it = this->format_fields_map_.find(name); + if(it != this->format_fields_map_.end()) return(&this->format_fields_[it->second]); + return(nullptr); + } + + const YonFormat* GetFormat(const int& idx) const { + map_reverse_type::const_iterator it = this->format_fields_reverse_map_.find(idx); + if(it != this->format_fields_reverse_map_.end()) return(&this->format_fields_[it->second]); + return(nullptr); + } + + const io::VcfFilter* GetFilter(const std::string& name) const { + map_type::const_iterator it = this->filter_fields_map_.find(name); + if(it != this->filter_fields_map_.end()) return(&this->filter_fields_[it->second]); + return(nullptr); + } - delete [] entry.info_fields; - entry.info_fields = new map_entry_type[entry.header_magic.n_info_values]; - for(U32 i = 0; i < entry.header_magic.n_info_values; ++i) stream >> entry.info_fields[i]; + const 
io::VcfFilter* GetFilter(const int& idx) const { + map_reverse_type::const_iterator it = this->filter_fields_reverse_map_.find(idx); + if(it != this->filter_fields_reverse_map_.end()) return(&this->filter_fields_[it->second]); + return(nullptr); + } - delete [] entry.format_fields; - entry.format_fields = new map_entry_type[entry.header_magic.n_format_values]; - for(U32 i = 0; i < entry.header_magic.n_format_values; ++i) stream >> entry.format_fields[i]; + const std::string* GetSample(const std::string& name) const { + map_type::const_iterator it = this->samples_map_.find(name); + if(it != this->samples_map_.end()) return(&this->samples_[it->second]); + return(nullptr); + } - delete entry.filter_fields; - entry.filter_fields = new map_entry_type[entry.header_magic.n_filter_values]; - for(U32 i = 0; i < entry.header_magic.n_filter_values; ++i) stream >> entry.filter_fields[i]; + const int32_t GetSampleId(const std::string& name) const { + map_type::const_iterator it = this->samples_map_.find(name); + if(it != this->samples_map_.end()) return(it->second); + return(-1); + } - entry.literals.resize(entry.header_magic.l_literals); - stream.read(&entry.literals[0], entry.header_magic.l_literals); + bool BuildReverseMaps(void); + bool BuildMaps(void); - entry.buildHashTables(); + /**< + * Recodes the internal IDX field for contig info, INFO, FORMAT, and FILTER + * from any range to the range [0, 1, ..., n-1] as desired in Tachyon. + * @return Returns TRUE upon success or FALSE otherwise. + */ + bool RecodeIndices(void); + /**< + * Converts this header object into a hts_vcf_header object from the + * internally stored literal string. This object is required for + * writing out VCF/BCF files. 
+ * @return + */ + hts_vcf_header* ConvertVcfHeaderLiterals(const bool add_format = true); + hts_vcf_header* ConvertVcfHeader(const bool add_format = true); + + void AddGenotypeAnnotationFields(void); + + // Append a string to the literal string + inline void AppendLiteralString(const std::string& literal_addition){ this->literals_ += literal_addition; } + + // Print the literals and the column header. + std::ostream& PrintVcfHeader(std::ostream& stream) const{ + stream << this->literals_; + stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; + if(this->samples_.size()){ + stream << "\tFORMAT\t"; + stream << this->samples_[0]; + for(size_t i = 1; i < this->samples_.size(); ++i) + stream << "\t" + this->samples_[i]; + } + stream << "\n"; return(stream); } + std::string ToString(const bool is_bcf = false) const{ + std::string string = "##fileformat=VCFv4.1\n"; + uint32_t idx = 0; + for(U32 i = 0; i < this->contigs_.size(); ++i) string += this->contigs_[i].ToVcfString(is_bcf) + "\n"; + for(U32 i = 0; i < this->structured_extra_fields_.size(); ++i) string += this->structured_extra_fields_[i].ToVcfString() + "\n"; + for(U32 i = 0; i < this->filter_fields_.size(); ++i) string += this->filter_fields_[i].ToVcfString(idx++) + "\n"; + for(U32 i = 0; i < this->info_fields_.size(); ++i) string += this->info_fields_[i].ToVcfString(idx++) + "\n"; + for(U32 i = 0; i < this->format_fields_.size(); ++i) string += this->format_fields_[i].ToVcfString(idx++) + "\n"; + for(U32 i = 0; i < this->extra_fields_.size(); ++i) string += this->extra_fields_[i].ToVcfString() + "\n"; + return(string); + } + + friend std::ostream& operator<<(std::ostream& stream, const VariantHeader& header); + friend std::istream& operator>>(std::istream& stream, VariantHeader& header); + friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const VariantHeader& header); + friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, VariantHeader& header); + public: - magic_type header_magic; - 
std::string literals; - contig_type* contigs; - sample_type* samples; - map_entry_type* info_fields; - map_entry_type* format_fields; - map_entry_type* filter_fields; - - // Constructed during run-time - hash_table_type* htable_contigs; // hash table for contig names - hash_table_type* htable_samples; // hash table for sample names - hash_table_type* htable_info_fields; // hash map from name to identifier - hash_table_type* htable_format_fields; // hash map from name to identifier - hash_table_type* htable_filter_fields; // hash map from name to identifier + // VCF file version string. + std::string fileformat_string_; + // Literal string for VcfHeader data. Contains all of the Vcf header data up + // to the start of the main header line ("#CHROM"...). As such, sample names + // are not available in this string and needs to be appended before converting + // back into a htslib vcf header. + std::string literals_; + + // Vcf header lines parse into: + // Samples: Individual sample names. + // VcfContig: Information relating to the interpretation of a contig. Data + // include its name, length in bases, its internal index identifier + // and optional additional information. + // VcfInfo: Data specifying a given INFO field + // VcfFormat: Data specifying a given FORMAT field + // VcfFilter: Data specifying a given FILTER field + // VcfStructuredExtra: + std::vector samples_; + std::vector contigs_; + std::vector info_fields_; + std::vector format_fields_; + std::vector filter_fields_; + std::vector structured_extra_fields_; + std::vector extra_fields_; + + // Utility members + // + // Hash tables allowing the mapping from the unique identifier string + // (such as contig name) to the relative index offset of that object. + // This approach requires another layer of indirection when mapping + // from the index to the actual target. 
For example: + // + // contigs_[contigs_map_["chr20"].second] <- maps to the actual target + // + // The reverse maps allows the mapping from a unique IDX identifier + // to the relative index offset of that object. As above, this requires + // an addition indirect lookup to access the desired object. For example + // mapping the first occuring FORMAT field to its name: + // + // format_fields_[format_fields_reverse_map_[container.at(0)->d.fmt[0].id].second].id + // + // map_type hash tables permits mapping string name -> index offset + // map_reverse_type hash tables permits mapping integer IDX -> index offset + map_type samples_map_; + map_type contigs_map_; + map_type info_fields_map_; + map_type format_fields_map_; + map_type filter_fields_map_; + map_reverse_type contigs_reverse_map_; // map IDX -> index offset + map_reverse_type info_fields_reverse_map_; // map IDX -> index offset + map_reverse_type format_fields_reverse_map_; // map IDX -> index offset + map_reverse_type filter_fields_reverse_map_; // map IDX -> index offset }; } -} diff --git a/lib/core/meta_allele.cpp b/lib/core/meta_allele.cpp index c5cab9b..269e5ef 100644 --- a/lib/core/meta_allele.cpp +++ b/lib/core/meta_allele.cpp @@ -79,5 +79,12 @@ void MetaAllele::operator()(const char* const in, const U32 length){ memcpy(this->allele, &in[0], this->l_allele); } +void MetaAllele::operator()(const std::string& in){ + this->l_allele = in.size(); + delete [] this->allele; + this->allele = new char[this->l_allele]; + memcpy(this->allele, in.data(), this->l_allele); +} + } } diff --git a/lib/core/meta_allele.h b/lib/core/meta_allele.h index ba14cde..bc4bdcf 100644 --- a/lib/core/meta_allele.h +++ b/lib/core/meta_allele.h @@ -30,6 +30,7 @@ struct MetaAllele{ void operator()(const char* const in); void operator()(const char* const in, const U32 length); + void operator()(const std::string& in); inline const U16& size(void) const{ return(this->l_allele); } inline const U16& length(void) const{ 
return(this->l_allele); } diff --git a/lib/core/meta_entry.cpp b/lib/core/meta_entry.cpp index 31c2b9e..3e13fbe 100644 --- a/lib/core/meta_entry.cpp +++ b/lib/core/meta_entry.cpp @@ -4,6 +4,7 @@ namespace tachyon{ namespace core{ MetaEntry::MetaEntry() : + n_base_ploidy(0), n_alleles(0), info_pattern_id(-1), filter_pattern_id(-1), @@ -14,15 +15,17 @@ MetaEntry::MetaEntry() : alleles(nullptr) {} -MetaEntry::MetaEntry(const bcf_entry_type& bcf_entry) : - n_alleles(bcf_entry.body->n_allele), +MetaEntry::MetaEntry(const bcf1_t* record) : + n_base_ploidy(0), + n_alleles(record->n_allele), + //n_alleles(0), info_pattern_id(-1), filter_pattern_id(-1), format_pattern_id(-1), - quality(bcf_entry.body->QUAL), - contigID(bcf_entry.body->CHROM), - position(bcf_entry.body->POS), - name(bcf_entry.ID, bcf_entry.l_ID), + quality(record->qual), + contigID(record->rid), + position(record->pos), + name(record->d.id), alleles(static_cast(::operator new[](this->n_alleles*sizeof(allele_type)))) { // Fix for the special case when ALT is not encoded @@ -32,7 +35,7 @@ MetaEntry::MetaEntry(const bcf_entry_type& bcf_entry) : this->alleles = static_cast(::operator new[](this->n_alleles*sizeof(allele_type))); new( &this->alleles[0] ) allele_type( ); - this->alleles[0](bcf_entry.alleles[0].data, bcf_entry.alleles[0].length); + this->alleles[0](std::string(record->d.allele[0])); new( &this->alleles[1] ) allele_type( ); this->alleles[1].allele = new char[1]; this->alleles[1].allele[0] = '.'; @@ -40,26 +43,27 @@ MetaEntry::MetaEntry(const bcf_entry_type& bcf_entry) : } else { for(U32 i = 0; i < this->n_alleles; ++i){ new( &this->alleles[i] ) allele_type( ); - this->alleles[i](bcf_entry.alleles[i].data, bcf_entry.alleles[i].length); + this->alleles[i](std::string(record->d.allele[i])); } } if(this->n_alleles == 2) this->controller.biallelic = true; - this->controller.simple_snv = bcf_entry.isSimple(); - if(this->isBiallelicSNV()) + this->controller.simple_snv = (this->alleles[0].length() == 1 && 
this->alleles[1].length() == 1); + if(this->IsBiallelicSNV()) this->controller.simple_snv = true; } -MetaEntry::MetaEntry(const bcf_entry_type& bcf_entry, const U64 position_offset) : - n_alleles(bcf_entry.body->n_allele), +MetaEntry::MetaEntry(const bcf1_t* record, const U64 position_offset) : + n_base_ploidy(0), + n_alleles(record->n_allele), //n_alleles(0), info_pattern_id(-1), filter_pattern_id(-1), format_pattern_id(-1), - quality(bcf_entry.body->QUAL), - contigID(bcf_entry.body->CHROM), - position(bcf_entry.body->POS - position_offset), - name(bcf_entry.ID, bcf_entry.l_ID), + quality(record->qual), + contigID(record->rid), + position(record->pos - position_offset), + name(record->d.id), alleles(static_cast(::operator new[](this->n_alleles*sizeof(allele_type)))) { // Fix for the special case when ALT is not encoded @@ -69,7 +73,7 @@ MetaEntry::MetaEntry(const bcf_entry_type& bcf_entry, const U64 position_offset) this->alleles = static_cast(::operator new[](this->n_alleles*sizeof(allele_type))); new( &this->alleles[0] ) allele_type( ); - this->alleles[0](bcf_entry.alleles[0].data, bcf_entry.alleles[0].length); + this->alleles[0](std::string(record->d.allele[0])); new( &this->alleles[1] ) allele_type( ); this->alleles[1].allele = new char[1]; this->alleles[1].allele[0] = '.'; @@ -77,17 +81,18 @@ MetaEntry::MetaEntry(const bcf_entry_type& bcf_entry, const U64 position_offset) } else { for(U32 i = 0; i < this->n_alleles; ++i){ new( &this->alleles[i] ) allele_type( ); - this->alleles[i](bcf_entry.alleles[i].data, bcf_entry.alleles[i].length); + this->alleles[i](std::string(record->d.allele[i])); } } if(this->n_alleles == 2) this->controller.biallelic = true; - this->controller.simple_snv = bcf_entry.isSimple(); - if(this->isBiallelicSNV()) + this->controller.simple_snv = (this->alleles[0].length() == 1 && this->alleles[1].length() == 1); + if(this->IsBiallelicSNV()) this->controller.simple_snv = true; } MetaEntry::MetaEntry(const self_type& other) : + 
n_base_ploidy(other.n_base_ploidy), controller(other.controller), n_alleles(other.n_alleles), info_pattern_id(other.info_pattern_id), @@ -109,8 +114,8 @@ MetaEntry::~MetaEntry(){ ::operator delete[](static_cast(this->alleles)); }; -const bool MetaEntry::usePackedRefAlt(void) const{ - if(this->isBiallelic() == false || this->isDiploid() == false) +bool MetaEntry::UsePackedRefAlt(void) const{ + if(this->IsBiallelic() == false || this->IsDiploid() == false) return false; if(std::regex_match(std::string(this->alleles[0].allele, this->alleles[0].l_allele), constants::YON_REGEX_PACKED_ALLELES) && @@ -120,8 +125,8 @@ const bool MetaEntry::usePackedRefAlt(void) const{ return false; } -const BYTE MetaEntry::packRefAltByte(void) const{ - assert(this->usePackedRefAlt()); +BYTE MetaEntry::PackRefAltByte(void) const{ + assert(this->UsePackedRefAlt()); BYTE ref_alt = 0; // start out with empty if(this->alleles[0].l_allele == 9 && strncmp(this->alleles[0].allele, "", 9) == 0){ diff --git a/lib/core/meta_entry.h b/lib/core/meta_entry.h index d57f2b8..1bc83be 100644 --- a/lib/core/meta_entry.h +++ b/lib/core/meta_entry.h @@ -8,9 +8,10 @@ #include "core/header/variant_header.h" #include "meta_allele.h" #include "variant_controller.h" -#include "io/bcf/BCFEntry.h" #include "containers/data_container.h" +#include "htslib/vcf.h" + namespace tachyon{ namespace core{ @@ -23,62 +24,61 @@ struct MetaEntry{ typedef io::BasicBuffer buffer_type; typedef MetaAllele allele_type; typedef VariantController controller_type; - typedef bcf::BCFEntry bcf_entry_type; public: MetaEntry(); MetaEntry(const self_type& other); - MetaEntry(const bcf_entry_type& bcf_entry); // transmute data from bcfentry - MetaEntry(const bcf_entry_type& bcf_entry, const U64 position_offset); // transmute from bcfentry with positional offset + MetaEntry(const bcf1_t* record); + MetaEntry(const bcf1_t* record, const U64 position_offset); ~MetaEntry(); // Check if a field is set - inline const bool check_info_field(const 
datablock_footer_type& block, const U32 info_identifier) const{ - return(block.info_bit_vectors[this->info_pattern_id][info_identifier]); + inline bool CheckInfoField(const datablock_footer_type& block, const U32 info_identifier) const{ + return(block.info_patterns[this->info_pattern_id][info_identifier]); } - inline const bool check_format_field(const datablock_footer_type& block, const U32 format_identifier) const{ - return(block.format_bit_vectors[this->format_pattern_id][format_identifier]); + inline bool CheckFormatField(const datablock_footer_type& block, const U32 format_identifier) const{ + return(block.format_patterns[this->format_pattern_id][format_identifier]); } - inline const bool check_filter_field(const datablock_footer_type& block, const U32 filter_identifier) const{ - return(block.filter_bit_vectors[this->filter_pattern_id][filter_identifier]); + inline bool CheckFilterField(const datablock_footer_type& block, const U32 filter_identifier) const{ + return(block.filter_patterns[this->filter_pattern_id][filter_identifier]); } // Boolean checks // Supportive boolean functions - inline const bool hasGT(void) const{ return(this->controller.gt_available); } - inline const bool isBiallelic(void) const{ return(this->controller.biallelic); } - inline const bool isBiallelicSNV(void) const{ return(this->controller.biallelic == true && this->controller.simple_snv == true); } - inline const bool isDiploid(void) const{ return(this->controller.diploid); } - inline const bool isMixedPloidy(void) const{ return(this->controller.mixed_ploidy); } - inline const bool isAnyGTMissing(void) const{ return(this->controller.gt_anyMissing); } - inline const bool isAnyGTMixedPloidy(void) const{ return(this->controller.gt_anyNA); } - inline const bool isGTMixedPhasing(void) const{ return(this->controller.gt_mixed_phasing); } - inline const bool getControllerPhase(void) const{ return(this->controller.gt_phase); } - - inline const TACHYON_GT_ENCODING getGenotypeEncoding(void) 
const{ return(TACHYON_GT_ENCODING(this->controller.gt_compression_type)); } - inline const TACHYON_GT_PRIMITIVE_TYPE getGenotypeType(void) const{ return(TACHYON_GT_PRIMITIVE_TYPE(this->controller.gt_primtive_type)); } - - inline const float& getQuality(void){ return(this->quality); } - inline const std::string& getName(void){ return(this->name); } - inline const U16& getNumberAlleles(void){ return(this->n_alleles); } - inline const U32 getContigID(void){ return(this->contigID); } - inline const U64 getPosition(void){ return(this->position); } - - inline const float& getQuality(void) const{ return(this->quality); } - inline const std::string& getName(void) const{ return(this->name); } - inline const U16& getNumberAlleles(void) const{ return(this->n_alleles); } - inline const U32 getContigID(void) const{ return(this->contigID); } - inline const U64 getPosition(void) const{ return(this->position); } + inline bool HasGT(void) const{ return(this->controller.gt_available); } + inline bool IsBiallelic(void) const{ return(this->controller.biallelic); } + inline bool IsBiallelicSNV(void) const{ return(this->controller.biallelic == true && this->controller.simple_snv == true); } + inline bool IsDiploid(void) const{ return(this->controller.diploid); } + inline bool IsMixedPloidy(void) const{ return(this->controller.mixed_ploidy); } + inline bool IsAnyGTMissing(void) const{ return(this->controller.gt_anyMissing); } + inline bool IsAnyGTMixedPloidy(void) const{ return(this->controller.gt_anyNA); } + inline bool IsGTMixedPhasing(void) const{ return(this->controller.gt_mixed_phasing); } + inline bool GetControllerPhase(void) const{ return(this->controller.gt_phase); } + + inline TACHYON_GT_ENCODING GetGenotypeEncoding(void) const{ return(TACHYON_GT_ENCODING(this->controller.gt_compression_type)); } + inline TACHYON_GT_PRIMITIVE_TYPE GetGenotypeType(void) const{ return(TACHYON_GT_PRIMITIVE_TYPE(this->controller.gt_primtive_type)); } + + inline const float& GetQuality(void){ 
return(this->quality); } + inline const std::string& GetName(void){ return(this->name); } + inline const U16& GetNumberAlleles(void){ return(this->n_alleles); } + inline U32 GetContigID(void){ return(this->contigID); } + inline U64 GetPosition(void){ return(this->position); } + + inline const float& GetQuality(void) const{ return(this->quality); } + inline const std::string& GetName(void) const{ return(this->name); } + inline const U16& GetNumberAlleles(void) const{ return(this->n_alleles); } + inline U32 GetContigID(void) const{ return(this->contigID); } + inline U64 GetPosition(void) const{ return(this->position); } // Set and get for patterns - inline S32& getInfoPatternID(void){ return(this->info_pattern_id); } - inline S32& getFormatPatternID(void){ return(this->format_pattern_id); } - inline S32& getFilterPatternID(void){ return(this->filter_pattern_id); } - inline const S32& getInfoPatternID(void) const{ return(this->info_pattern_id); } - inline const S32& getFormatPatternID(void) const{ return(this->format_pattern_id); } - inline const S32& getFilterPatternID(void) const{ return(this->filter_pattern_id); } + inline S32& GetInfoPatternId(void){ return(this->info_pattern_id); } + inline S32& GetFormatPatternId(void){ return(this->format_pattern_id); } + inline S32& GetFilterPatternId(void){ return(this->filter_pattern_id); } + inline const S32& GetInfoPatternId(void) const{ return(this->info_pattern_id); } + inline const S32& GetFormatPatternId(void) const{ return(this->format_pattern_id); } + inline const S32& GetFilterPatternId(void) const{ return(this->filter_pattern_id); } /**< * Check if it is possible to pack the REF and ALT allele strings into @@ -86,7 +86,7 @@ struct MetaEntry{ * match the regular expression pattern "^([ATGCN\\.]{1}){1}|(){1}$" * @return Returns TRUE if it is possible to bitpack data or FALSE otherwise */ - const bool usePackedRefAlt(void) const; + bool UsePackedRefAlt(void) const; /**< * Bitpack biallelic, diploid REF and ALT data 
into a single BYTE. Failure @@ -94,11 +94,36 @@ struct MetaEntry{ * errors. * @return Returns a bitpacked BYTE */ - const BYTE packRefAltByte(void) const; + BYTE PackRefAltByte(void) const; + + std::string GetAlleleString(void) const{ + std::string ret = this->alleles[0].toString(); + for(U32 i = 1; i < this->n_alleles; ++i) + ret += "," + this->alleles[i].toString(); + return(ret); + } + + /**< + * Updates a htslib bcf1_t record with data available in this meta record. + * This function is used when converting yon1_t records to bcf1_t records. + * @param rec Input bcf1_t record that has been allocated. + * @param hdr Input bcf hdr structure converted from tachyon header. + * @return Returns the input bcf1_t record pointer. + */ + bcf1_t* UpdateHtslibVcfRecord(bcf1_t* rec, bcf_hdr_t* hdr) const{ + rec->rid = this->contigID; + rec->pos = this->position; + bcf_update_id(hdr, rec, this->name.data()); + bcf_update_alleles_str(hdr, rec, this->GetAlleleString().data()); + rec->qual = this->quality; + + return(rec); + } public: // Markup: populate from streams controller_type controller; + BYTE n_base_ploidy; U16 n_alleles; S32 info_pattern_id; // Info pattern ID S32 filter_pattern_id; // Filter pattern ID @@ -106,6 +131,17 @@ struct MetaEntry{ float quality; U64 contigID; U64 position; + // Todo: Add end_position_longest. This would allow us to directly query + // precomputed end positions. + // Todo: Extend controller to U32 and add fields below or optionally add + // another controller field with variant-identifying information. + // This would support queries on type directly from precomputed + // lookups.
+ // Fields of interest: + // isSNV + // isComplex + // isIndel + // isSV std::string name; allele_type* alleles; }; diff --git a/lib/core/occ.cpp b/lib/core/occ.cpp new file mode 100644 index 0000000..ba3ab95 --- /dev/null +++ b/lib/core/occ.cpp @@ -0,0 +1,93 @@ +#include "occ.h" + +namespace tachyon{ + +bool yon_occ::ReadTable(const std::string file_name, + const VariantHeader& header, + const char delimiter) +{ + std::ifstream f; + f.open(file_name); + if(!f.good()){ + std::cerr << utility::timestamp("ERROR") << "Stream is bad! Cannot open " << file_name << "..." << std::endl; + return false; + } + + uint32_t n_line = 0; + std::string line; + // Iterate over available lines in the input file. + while(getline(f, line)){ + //std::cerr << line << std::endl; + + // Tokenize string with delimiter. + std::vector params = utility::split(line, delimiter, false); + + // Assert that the first column is an existing sample name. + const int32_t sample_id = header.GetSampleId(params[0]); + if(sample_id < 0){ + std::cerr << utility::timestamp("LOG") << "Cannot find sample \"" << params[0] << "\" in groupings file..." << std::endl; + continue; + } + + // Iterate over tokens. 
+ for(U32 i = 1; i < params.size(); ++i){ + map_type::const_iterator it = this->map.find(params[i]); + if(it == this->map.end()){ + // Not already set + this->table.push_back(std::vector(header.GetNumberSamples() + 1, 0)); + this->table.back()[sample_id + 1] = true; + this->map[params[i]] = this->row_names.size(); + this->row_names.push_back(params[i]); + //std::cerr << "Adding group: " << params[i] << " for " << this->table.back().size() << " samples" << std::endl; + } else { + // Already set + this->table[it->second][sample_id + 1] = true; + } + } + } + + return true; +} + +bool yon_occ::BuildTable(void){ + this->occ.clear(); + if(this->table.size() == 0){ + return false; + } + + this->occ = std::vector< std::vector >(this->table.size(), std::vector( this->table[0].size(), 0)); + this->cum_sums = std::vector< uint32_t >( this->occ.size() ); + + for(U32 i = 0; i < this->table.size(); ++i){ + assert(this->table[i][0] == 0); + for(U32 j = 1; j < this->occ[i].size(); ++j) + this->occ[i][j] += this->occ[i][j-1] + this->table[i][j]; + + this->cum_sums[i] = this->occ[i].back(); + } + return true; +} + +bool yon_occ::BuildTable(const yon_gt_ppa& ppa){ + this->occ.clear(); + if(this->table.size() == 0){ + return false; + } + + assert(ppa.n_samples + 1 == this->table[0].size()); + + this->occ = std::vector< std::vector >(this->table.size(), std::vector( this->table[0].size(), 0)); + this->cum_sums = std::vector< uint32_t >( this->occ.size() ); + + for(U32 i = 0; i < this->table.size(); ++i){ + assert(this->table[i][0] == 0); + for(U32 j = 1; j < this->occ[i].size(); ++j) + this->occ[i][j] += this->occ[i][j - 1] + this->table[i][ppa[j - 1] + 1]; + + assert(this->occ[i][0] == 0); + this->cum_sums[i] = this->occ[i].back(); + } + return true; +} + +} diff --git a/lib/core/occ.h b/lib/core/occ.h new file mode 100644 index 0000000..d50a9bf --- /dev/null +++ b/lib/core/occ.h @@ -0,0 +1,45 @@ +#ifndef CORE_OCC_H_ +#define CORE_OCC_H_ + +#include +#include +#include +#include 
+#include +#include + +#include "support/helpers.h" +#include "header/variant_header.h" +#include "genotypes.h" + +namespace tachyon{ + +struct yon_occ { + typedef std::unordered_map map_type; + + yon_occ() = default; + ~yon_occ() = default; + + bool ReadTable(const std::string file_name, const VariantHeader& header, const char delimiter = '\t'); + bool BuildTable(void); + bool BuildTable(const yon_gt_ppa& ppa); + + // Map from group name to row offset in the table. + map_type map; + // Unique names of grouping factors. + std::vector row_names; + // Total cumulative sums for each row. + std::vector cum_sums; + + // a matrix with proportions samples times groupings + // rows corresponds to the cumulative sum of a grouping + // over the samples. The table corresponds to the set + // membership (presence or absence) and the occ table + // corresponds to the cumsum. + std::vector< std::vector > table; + std::vector< std::vector > occ; +}; + +} + +#endif /* CORE_OCC_H_ */ diff --git a/lib/core/ts_tv_object.h b/lib/core/ts_tv_object.h index e6cae85..dfa4a61 100644 --- a/lib/core/ts_tv_object.h +++ b/lib/core/ts_tv_object.h @@ -1,7 +1,6 @@ #ifndef CORE_TS_TV_OBJECT_H_ #define CORE_TS_TV_OBJECT_H_ -#include "genotype_summary.h" #include "meta_entry.h" #include "support/type_definitions.h" @@ -12,7 +11,6 @@ struct TsTvObject{ private: typedef TsTvObject self_type; typedef MetaEntry meta_type; - typedef containers::GenotypeSummaryObject gtsum_type; public: TsTvObject() : @@ -47,17 +45,12 @@ struct TsTvObject{ return(*this); } - inline const double getTiTVRatio(void) const{ + inline double getTiTVRatio(void) const{ // Prevent division by 0 if(this->n_transversions == 0) return 0; return((double)this->n_transitions / this->n_transversions); } - // Todo: - void update(const gtsum_type& gtsum, const meta_type& meta){ - - } - private: friend std::ostream& operator<<(std::ostream& out, const self_type& entry){ U64 n_total_variants = 0; diff --git a/lib/core/variant_controller.h 
b/lib/core/variant_controller.h index a943bff..97454ac 100644 --- a/lib/core/variant_controller.h +++ b/lib/core/variant_controller.h @@ -89,7 +89,7 @@ struct VariantController{ this->all_snv = other->all_snv; } - inline const U16 toValue(void) const{ return((U16)*reinterpret_cast(this)); } + inline U16 toValue(void) const{ return((U16)*reinterpret_cast(this)); } /**< Controller field. The first seven fields describes * genotype-specific information. The remainder bit-fields diff --git a/lib/core/variant_import_writer.h b/lib/core/variant_import_writer.h index d60b92e..69a1a4c 100644 --- a/lib/core/variant_import_writer.h +++ b/lib/core/variant_import_writer.h @@ -5,7 +5,8 @@ #include #include "index/index.h" -#include "support/type_definitions.h" +#include "support/magic_constants.h" +#include "containers/data_container.h" namespace tachyon { @@ -13,6 +14,7 @@ class VariantImportWriterInterface { private: typedef VariantImportWriterInterface self_type; typedef index::Index sorted_index_type; + typedef containers::DataContainer container_type; public: VariantImportWriterInterface(); @@ -21,6 +23,22 @@ class VariantImportWriterInterface { void writeIndex(void); virtual bool open(const std::string output) =0; + bool WriteBlockFooter(const container_type& footer){ + if(this->stream == nullptr) return false; + const U64 start_footer_pos = this->stream->tellp(); + utility::SerializePrimitive(footer.header.data_header.uLength, *this->stream); + utility::SerializePrimitive(footer.header.data_header.cLength, *this->stream); + this->stream->write(reinterpret_cast(&footer.header.data_header.crc[0]), MD5_DIGEST_LENGTH); + *this->stream << footer.buffer_data; + return(this->stream->good()); + } + + bool WriteEndOfBlock(void){ + if(this->stream == nullptr) return false; + utility::SerializePrimitive(constants::TACHYON_BLOCK_EOF, *this->stream); + return(this->stream->good()); + } + public: U64 n_blocks_written; U64 n_variants_written; diff --git 
a/lib/core/variant_importer_container_stats.h b/lib/core/variant_importer_container_stats.h index 5e23395..c94312a 100644 --- a/lib/core/variant_importer_container_stats.h +++ b/lib/core/variant_importer_container_stats.h @@ -14,8 +14,8 @@ struct VariantImporterStatsObject{ ~VariantImporterStatsObject(){} void operator+=(const data_container_type& container){ - this->cost_uncompressed += container.getObjectSizeUncompressed(); - this->cost_compressed += container.getObjectSize(); + this->cost_uncompressed += container.GetObjectSizeUncompressed(); + this->cost_compressed += container.GetObjectSize(); } friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ @@ -111,7 +111,7 @@ class VariantImporterContainerStats{ inline const_reference back(void) const{ return(this->entries_[this->n_entries_ - 1]); } // Capacity - inline const bool empty(void) const{ return(this->n_entries_ == 0); } + inline bool empty(void) const{ return(this->n_entries_ == 0); } inline const size_type& size(void) const{ return(this->n_entries_); } inline const size_type& capacity(void) const{ return(this->n_capacity_); } diff --git a/lib/core/variant_reader_filters.cpp b/lib/core/variant_reader_filters.cpp new file mode 100644 index 0000000..c2642a6 --- /dev/null +++ b/lib/core/variant_reader_filters.cpp @@ -0,0 +1,54 @@ +#include "variant_reader_filters.h" + +namespace tachyon{ + +VariantReaderFilters::VariantReaderFilters() : + n_filters_(0), + n_capacity_(256), + require_genotypes(false), + target_intervals(false), + filter_data_(new pointer[this->n_capacity_]) +{ + +} + +VariantReaderFilters::~VariantReaderFilters(){ + if(this->filter_data_ != nullptr){ + for(U32 i = 0; i < this->n_filters_; ++i) + delete this->filter_data_[i]; + + delete [] this->filter_data_; + } +} + +bool VariantReaderFilters::filterAlleleFrequency(const_pointer pair, const yon1_t& objects, const U32& position) const{ + for(U32 i = 3; i < objects.gt_sum->d->n_ac_af; ++i){ + 
if(pair->applyFilter(objects.gt_sum->d->af[i])) + return true; + } + return(false); +} + + +bool VariantReaderFilters::filterUnseenAlternativeAlleles(const_pointer pair, const yon1_t& objects, const U32& position) const{ + for(U32 i = 3; i < objects.gt_sum->d->n_ac_af; ++i){ + if(pair->applyFilter(objects.gt_sum->d->ac[i] + objects.gt_sum->d->ac[i] == 0)) + return true; + } + return false; +} + +bool VariantReaderFilters::filter(yon1_t& objects, const U32 position) const{ + if(this->require_genotypes) + objects.EvaluateSummary(true); + + for(U32 i = 0 ; i < this->filters.size(); ++i){ + if((this->*(this->filters[i]))(this->filter_data_[i], objects, position) == false){ + return false; + } + } + return true; +} + + +} diff --git a/lib/core/variant_reader_filters.h b/lib/core/variant_reader_filters.h index 2601ab7..312625b 100644 --- a/lib/core/variant_reader_filters.h +++ b/lib/core/variant_reader_filters.h @@ -1,6 +1,7 @@ #ifndef CONTAINERS_VARIANT_READER_FILTERS_H_ #define CONTAINERS_VARIANT_READER_FILTERS_H_ +#include "core/variant_record.h" #include "variant_reader_filters_tuple.h" #include "variant_reader_objects.h" @@ -33,197 +34,108 @@ struct VariantReaderFilters{ typedef const value_type* const_pointer; typedef std::ptrdiff_t difference_type; typedef std::size_t size_type; - typedef bool (self_type::*filter_function)(const_pointer pair, const objects_type& objects, const U32& position) const; + + typedef bool (self_type::*filter_function)(const_pointer pair, const yon1_t& objects, const U32& position) const; typedef bool (self_type::*family_filter_function)(void) const; public: - VariantReaderFilters() : - n_filters_(0), - n_capacity_(256), - filter_data_(new pointer[this->n_capacity_]), - require_genotypes(false), - target_intervals(false) - { - - } - - ~VariantReaderFilters(){ - if(this->filter_data_ != nullptr){ - for(U32 i = 0; i < this->n_filters_; ++i) - delete this->filter_data_[i]; - - delete [] this->filter_data_; - } - } - + VariantReaderFilters(); 
+ ~VariantReaderFilters(); VariantReaderFilters(const VariantReaderFilters& other) = delete; template - void add(TACHYON_FILTER_FUNCTION filter_function, const T& r_value, const TACHYON_COMPARATOR_TYPE& comparator){ - // Todo: currently if full then return: fix to resize and update - if(this->size() + 1 == this->capacity()) - return; - - // Construct new filter function - this->filter_data_[this->n_filters_++] = new VariantReaderFiltersTuple(r_value, comparator); - - switch(filter_function){ - case(YON_FILTER_NUMBER_ALT_ALLELES): - this->filters.push_back(&self_type::filterAlternativeAlleles); - break; - case(YON_FILTER_MIXED_PHASING): - this->filters.push_back(&self_type::filterMixedPhasing); - break; - case(YON_FILTER_MIXED_PLOIDY): - this->filters.push_back(&self_type::filterMixedPloidy); - break; - case(YON_FILTER_MISSING_GT): - this->filters.push_back(&self_type::filterHasMissingGenotypes); - break; - case(YON_FILTER_ALLELE_FREQUENCY): - this->filters.push_back(&self_type::filterAlleleFrequency); - break; - case(YON_FILTER_ALLELE_COUNT): - this->filters.push_back(&self_type::filterAlleleCount); - break; - case(YON_FILTER_UNIFORM_PHASE): - this->filters.push_back(&self_type::filterUniformMatchPhase); - break; - case(YON_FILTER_KNOWN_NOVEL): - this->filters.push_back(&self_type::filterKnownNovel); - break; - case(YON_FILTER_REFERENCE_ALLELE): - this->filters.push_back(&self_type::filterReferenceAllele); - break; - case(YON_FILTER_ALT_ALLELE): - this->filters.push_back(&self_type::filterAlternativeAllele); - break; - case(YON_FILTER_NAME): - this->filters.push_back(&self_type::filterName); - break; - case(YON_FILTER_UNSEEN_ALT): - this->filters.push_back(&self_type::filterUnseenAlternativeAlleles); - break; - case(YON_FILTER_QUALITY): - this->filters.push_back(&self_type::filterQuality); - break; - } - } + void add(TACHYON_FILTER_FUNCTION filter_function, const T& r_value, const TACHYON_COMPARATOR_TYPE& comparator); + // Capacity inline const size_type& 
size(void) const{ return(this->n_filters_); } inline const size_type& capacity(void) const{ return(this->n_capacity_); } // Has mixed phasing - inline bool filterMixedPhasing(const_pointer pair, const objects_type& objects, const U32& position) const{ - return(pair->applyFilter(objects.meta_container->at(position).isGTMixedPhasing())); - } - - inline bool filterMixedPloidy(const_pointer pair, const objects_type& objects, const U32& position) const{ - return(pair->applyFilter((objects.genotype_summary->vectorA_[1] + objects.genotype_summary->vectorB_[1]))); + inline bool filterMixedPhasing(const_pointer pair, const yon1_t& objects, const U32& position) const{ + return(pair->applyFilter(objects.meta->IsGTMixedPhasing())); } - inline bool filterKnownNovel(const_pointer pair, const objects_type& objects, const U32& position) const{ - return(pair->applyFilter((U32)objects.meta_container->at(position).name.size())); + inline bool filterKnownNovel(const_pointer pair, const yon1_t& objects, const U32& position) const{ + return(pair->applyFilter((U32)objects.meta->name.size())); } - inline bool filterQuality(const_pointer pair, const objects_type& objects, const U32& position) const{ - return(pair->applyFilter(objects.meta_container->at(position).quality)); + inline bool filterQuality(const_pointer pair, const yon1_t& objects, const U32& position) const{ + return(pair->applyFilter(objects.meta->quality)); } // GT data matches this - inline bool filterUniformMatchPhase(const_pointer pair, const objects_type& objects, const U32& position) const - { - if(objects.meta_container->at(position).isGTMixedPhasing() == true) return false; - return(pair->applyFilter(objects.meta_container->at(position).controller.gt_phase)); + inline bool filterUniformMatchPhase(const_pointer pair, const yon1_t& objects, const U32& position) const{ + if(objects.meta->IsGTMixedPhasing() == true) return false; + return(pair->applyFilter(objects.meta->controller.gt_phase)); } - bool 
filterPloidy(const_pointer pair, const objects_type& objects, const U32& position) const; - bool filterSampleList(const_pointer pair, const objects_type& objects, const U32& position) const; - // BCFtools calculate this as the SUM of all ALT counts // We filter based on ANY ALT frequency OPERATOR the target frequency - bool filterAlleleFrequency(const_pointer pair, const objects_type& objects, const U32& position) const{ - const std::vector af = objects.genotype_summary->calculateAlleleFrequency(objects.meta_container->at(position)); - for(U32 i = 1; i < af.size(); ++i){ - if(pair->applyFilter(af[i])) - return true; - } - return(false); - } - - bool filterVariantClassification(const_pointer pair, const objects_type& object, const U32& position) const; + bool filterAlleleFrequency(const_pointer pair, const yon1_t& objects, const U32& position) const; + bool filterUnseenAlternativeAlleles(const_pointer pair, const yon1_t& objects, const U32& position) const; - bool filterUnseenAlternativeAlleles(const_pointer pair, const objects_type& object, const U32& position) const{ - for(U32 i = 0; i < object.meta_container->at(position).n_alleles; ++i){ - if(pair->applyFilter(object.genotype_summary->vectorA_[2+i] + object.genotype_summary->vectorB_[2+i] == 0)) - return true; - } - return false; - } - - bool filterFILTER(const_pointer pair, const objects_type& object, const U32& position) const; // Filter by desired FILTER values - bool filterINFO(const_pointer pair, const objects_type& object, const U32& position) const; // custom filter. e.g. 
AC<1024 - - inline bool filterAlternativeAlleles(const_pointer pair, const objects_type& object, const U32& position) const{ + inline bool filterAlternativeAlleles(const_pointer pair, const yon1_t& objects, const U32& position) const{ // Remove one to total count as REF is counted here // Recast as signed integer to avoid possible underflowing issues - return(pair->applyFilter(object.meta_container->at(position).getNumberAlleles() - 1)); + return(pair->applyFilter(objects.meta->GetNumberAlleles() - 1)); } - inline bool filterAlleleCount(const_pointer pair, const objects_type& object, const U32& position) const{ - for(U32 i = 1; i < object.meta_container->at(position).n_alleles; ++i){ - if(pair->applyFilter(object.genotype_summary->vectorA_[2+i] + object.genotype_summary->vectorB_[2+i])){ + inline bool filterAlleleCount(const_pointer pair, const yon1_t& objects, const U32& position) const{ + for(U32 i = 3; i < objects.gt_sum->d->n_ac_af; ++i){ + if(pair->applyFilter((U32)objects.gt_sum->d->ac[i])){ return true; } } return false; } - inline bool filterHasMissingGenotypes(const_pointer pair, const objects_type& object, const U32& position) const{ - return(pair->applyFilter(object.genotype_summary->vectorA_[1])); + // Unused parameter position available in definition to allow a unified pointer definition + inline bool filterHasMissingGenotypes(const_pointer pair, const yon1_t& objects, const U32& position) const{ + return(pair->applyFilter((U32)objects.gt_sum->d->ac[0])); + } + + // Unused parameter position available in definition to allow a unified pointer definition + inline bool filterMixedPloidy(const_pointer pair, const yon1_t& objects, const U32& position) const{ + return(pair->applyFilter((U32)objects.gt_sum->d->ac[1])); } - inline bool filterReferenceAllele(const_pointer pair, const objects_type& object, const U32& position) const{ - //std::cerr << object.meta->at(position).alleles[0].toString() << std::endl; - 
return(pair->applyFilter(object.meta_container->at(position).alleles[0].toString())); + inline bool filterReferenceAllele(const_pointer pair, const yon1_t& objects, const U32& position) const{ + //std::cerr << objects.meta->at(position).alleles[0].toString() << std::endl; + return(pair->applyFilter(objects.meta->alleles[0].toString())); } - inline bool filterAlternativeAllele(const_pointer pair, const objects_type& object, const U32& position) const{ - for(U32 i = 1; i < object.meta_container->at(position).n_alleles; ++i){ - if(pair->applyFilter(object.meta_container->at(position).alleles[i].toString())) + inline bool filterAlternativeAllele(const_pointer pair, const yon1_t& objects, const U32& position) const{ + for(U32 i = 1; i < objects.meta->n_alleles; ++i){ + if(pair->applyFilter(objects.meta->alleles[i].toString())) return true; } return false; } - inline bool filterName(const_pointer pair, const objects_type& object, const U32& position) const{ - return(pair->applyFilter(object.meta_container->at(position).name)); + inline bool filterName(const_pointer pair, const yon1_t& objects, const U32& position) const{ + return(pair->applyFilter(objects.meta->name)); } + // Not implemented + bool filterPloidy(const_pointer pair, const yon1_t& objects, const U32& position) const; + bool filterSampleList(const_pointer pair, const yon1_t& objects, const U32& position) const; + bool filterVariantClassification(const_pointer pair, const yon1_t& objects, const U32& position) const; + bool filterFILTER(const_pointer pair, const yon1_t& objects, const U32& position) const; // Filter by desired FILTER values + bool filterINFO(const_pointer pair, const yon1_t& objects, const U32& position) const; // custom filter. e.g. 
AC<1024 + /**< * Iteratively apply filters in the filter pointer vector * @param objects Target objects container structure * @param position Target position (relative loci) in the container * @return Returns TRUE if passes filtering or FALSE otherwise */ - bool filter(const objects_type& objects, const U32 position) const{ - // Todo: construct genotype summary globally for this variant - if(this->require_genotypes) - objects.genotype_container->at(position).getSummary(*objects.genotype_summary); - - for(U32 i = 0 ; i < this->filters.size(); ++i){ - if((this->*(this->filters[i]))(this->filter_data_[i], objects, position) == false){ - return false; - } - } - return true; - } + bool filter(yon1_t& objects, const U32 position) const; /**< * Checks if any filter function require genotype data to be loaded and prepared * @return Returns TRUE if genotype data is required or FALSE otherwise */ - inline const bool doRequireGenotypes(void) const{ return(this->require_genotypes); } + inline bool doRequireGenotypes(void) const{ return(this->require_genotypes); } public: size_type n_filters_; // number of filters @@ -235,6 +147,59 @@ struct VariantReaderFilters{ }; +template +void VariantReaderFilters::add(TACHYON_FILTER_FUNCTION filter_function, const T& r_value, const TACHYON_COMPARATOR_TYPE& comparator){ + // Todo: currently if full then return: fix to resize and update + if(this->size() + 1 == this->capacity()) + return; + + // Construct new filter function + this->filter_data_[this->n_filters_++] = new VariantReaderFiltersTuple(r_value, comparator); + + switch(filter_function){ + case(YON_FILTER_NUMBER_ALT_ALLELES): + this->filters.push_back(&self_type::filterAlternativeAlleles); + break; + case(YON_FILTER_MIXED_PHASING): + this->filters.push_back(&self_type::filterMixedPhasing); + break; + case(YON_FILTER_MIXED_PLOIDY): + this->filters.push_back(&self_type::filterMixedPloidy); + break; + case(YON_FILTER_MISSING_GT): + 
this->filters.push_back(&self_type::filterHasMissingGenotypes); + break; + case(YON_FILTER_ALLELE_FREQUENCY): + this->filters.push_back(&self_type::filterAlleleFrequency); + break; + case(YON_FILTER_ALLELE_COUNT): + this->filters.push_back(&self_type::filterAlleleCount); + break; + case(YON_FILTER_UNIFORM_PHASE): + this->filters.push_back(&self_type::filterUniformMatchPhase); + break; + case(YON_FILTER_KNOWN_NOVEL): + this->filters.push_back(&self_type::filterKnownNovel); + break; + case(YON_FILTER_REFERENCE_ALLELE): + this->filters.push_back(&self_type::filterReferenceAllele); + break; + case(YON_FILTER_ALT_ALLELE): + this->filters.push_back(&self_type::filterAlternativeAllele); + break; + case(YON_FILTER_NAME): + this->filters.push_back(&self_type::filterName); + break; + case(YON_FILTER_UNSEEN_ALT): + this->filters.push_back(&self_type::filterUnseenAlternativeAlleles); + break; + case(YON_FILTER_QUALITY): + this->filters.push_back(&self_type::filterQuality); + break; + } +} + + } diff --git a/lib/core/variant_reader_filters_tuple.h b/lib/core/variant_reader_filters_tuple.h index 78bfa9c..4ab99c8 100644 --- a/lib/core/variant_reader_filters_tuple.h +++ b/lib/core/variant_reader_filters_tuple.h @@ -3,6 +3,7 @@ #include +#include "support/helpers.h" #include "support/enums.h" namespace tachyon{ @@ -64,6 +65,10 @@ struct VariantReaderFiltersTuple : public VariantReaderFiltersTupleInterface{ case(YON_CMP_LESS_EQUAL): this->comparator = &self_type::__filterLesserEqual; break; case(YON_CMP_EQUAL): this->comparator = &self_type::__filterEqual; break; case(YON_CMP_NOT_EQUAL): this->comparator = &self_type::__filterNotEqual; break; + case(YON_CMP_REGEX): + default: + std::cerr << utility::timestamp("ERROR","FILTER") << "Numerical filtering operations do not support regular expression operations..." 
<< std::endl; + this->comparator = &self_type::__filterEqual; } } @@ -83,6 +88,7 @@ struct VariantReaderFiltersTuple : public VariantReaderFiltersTupleInterface{ case(YON_CMP_LESS_EQUAL): this->comparator = &self_type::__filterLesserEqual; break; case(YON_CMP_EQUAL): this->comparator = &self_type::__filterEqual; break; case(YON_CMP_NOT_EQUAL): this->comparator = &self_type::__filterNotEqual; break; + case(YON_CMP_REGEX): default: std::cerr << utility::timestamp("ERROR","FILTER") << "Numerical filtering operations do not support regular expression operations..." << std::endl; this->comparator = &self_type::__filterEqual; diff --git a/lib/core/variant_reader_objects.h b/lib/core/variant_reader_objects.h index f11e721..4609993 100644 --- a/lib/core/variant_reader_objects.h +++ b/lib/core/variant_reader_objects.h @@ -1,17 +1,20 @@ #ifndef CONTAINERS_VARIANT_READER_OBJECTS_H_ #define CONTAINERS_VARIANT_READER_OBJECTS_H_ +#include + #include "containers/meta_container.h" #include "containers/genotype_container.h" #include "containers/info_container.h" #include "containers/info_container_string.h" #include "containers/format_container.h" #include "containers/format_container_string.h" +#include "occ.h" namespace tachyon{ /**< - * The sole function of this struct is to keep all loaded containers and + * The function of this struct is to keep all loaded containers and * field identifiers in a single object. Loading the members of this object * occurs OUTSIDE this definition. 
*/ @@ -22,7 +25,7 @@ struct VariantReaderObjects{ typedef containers::GenotypeContainer gt_container_type; typedef containers::InfoContainerInterface info_interface_type; typedef containers::FormatContainerInterface format_interface_type; - typedef containers::GenotypeSummary genotype_summary_type; + typedef yon_gt_summary genotype_summary_type; public: VariantReaderObjects() : @@ -34,19 +37,34 @@ struct VariantReaderObjects{ genotype_container(nullptr), genotype_summary(nullptr), info_containers(nullptr), - format_containers(nullptr) - {} + format_containers(nullptr), + occ(nullptr) + { + } ~VariantReaderObjects(){ delete this->meta_container; delete this->genotype_container; delete this->genotype_summary; - for(U32 i = 0; i < this->n_loaded_info; ++i) delete this->info_containers[i]; + for(U32 i = 0; i < this->info_id_loaded.size(); ++i) + delete this->info_containers[this->info_id_loaded[i]]; delete [] this->info_containers; - for(U32 i = 0; i < this->n_loaded_format; ++i) delete this->format_containers[i]; + for(U32 i = 0; i < this->format_id_loaded.size(); ++i) + delete this->format_containers[this->format_id_loaded[i]]; delete [] this->format_containers; + delete this->occ; + } + + bool EvaluateOcc(){ + if(occ == nullptr) return false; + return(this->occ->BuildTable()); + } + + bool EvaluateOcc(yon_gt_ppa* ppa){ + if(occ == nullptr) return(this->EvaluateOcc()); + return(this->occ->BuildTable(*ppa)); } public: @@ -54,20 +72,17 @@ struct VariantReaderObjects{ bool loaded_meta; size_t n_loaded_info; size_t n_loaded_format; - std::vector info_id_fields_keep; - std::vector format_id_fields_keep; - std::vector additional_info_execute_flag_set; - std::vector< std::vector > local_match_keychain_info; - std::vector< std::vector > local_match_keychain_format; - - std::vector info_field_names; - std::vector format_field_names; + std::vector info_id_loaded; + std::vector format_id_loaded; + std::unordered_map info_container_map; + std::unordered_map format_container_map; 
meta_container_type* meta_container; gt_container_type* genotype_container; genotype_summary_type* genotype_summary; info_interface_type** info_containers; format_interface_type** format_containers; + yon_occ* occ; }; } diff --git a/lib/core/variant_reader_settings.h b/lib/core/variant_reader_settings.h index 6fcfe34..19b5970 100644 --- a/lib/core/variant_reader_settings.h +++ b/lib/core/variant_reader_settings.h @@ -18,13 +18,10 @@ struct VariantReaderSettings{ drop_format(false), header_only(false), show_header(true), - custom_delimiter(false), - custom_delimiter_char(0), - custom_output_format(false), - filter_any(false), - filter_all(false), annotate_genotypes(false), - output_FORMAT_as_vector(false) + use_htslib(false), + output("-"), + output_type('v') {} ~VariantReaderSettings() = default; @@ -49,17 +46,12 @@ struct VariantReaderSettings{ bool drop_format; // drop FORMAT fields bool header_only; // show only the VCF header bool show_header; // show the VCF header - bool custom_delimiter; // output uses a custom delimiter - char custom_delimiter_char; // what is the custom delimiter - bool custom_output_format; // output has a custom format - bool filter_any; // filter output - bool filter_all; // filter bool annotate_genotypes; - bool output_FORMAT_as_vector; + bool use_htslib; std::string input; std::string output; std::string keychain_file; - std::string output_type; + char output_type; std::string sample_names_file; std::vector sample_names; }; diff --git a/lib/core/variant_record.h b/lib/core/variant_record.h new file mode 100644 index 0000000..7d9c3b7 --- /dev/null +++ b/lib/core/variant_record.h @@ -0,0 +1,134 @@ +#ifndef CORE_VARIANT_RECORD_H_ +#define CORE_VARIANT_RECORD_H_ + +#include "containers/meta_container.h" +#include "containers/genotype_container.h" +#include "containers/info_container.h" +#include "containers/info_container_string.h" +#include "containers/format_container.h" +#include "containers/format_container_string.h" +#include "occ.h" 
+ +namespace tachyon{ + +// Forward declare to allow variant to reference +// parent (host) container. +namespace containers { +class VariantBlockContainer; +} + +struct yon1_t { + yon1_t(void) : + is_dirty(false), + is_loaded_meta(false), + is_loaded_gt(false), + n_format(0), n_info(0), n_filter(0), + id_block(0), + meta(nullptr), + gt(nullptr), + gt_sum(nullptr), + occ(nullptr), + info(nullptr), + fmt(nullptr), + info_containers(nullptr), + format_containers(nullptr), + gt_i(nullptr), + info_ids(nullptr), + format_ids(nullptr), + filter_ids(nullptr), + parent_container(nullptr) + { + } + + ~yon1_t(void){ + delete [] this->info; + delete [] this->fmt; + delete [] this->info_containers; + delete [] this->format_containers; + delete this->gt; + delete this->gt_sum; + // Do not delete occ. It is always borrowed! + } + + bool EvaluateSummary(bool lazy_evaluate = true){ + assert(this->gt != nullptr); + assert(this->gt->rcds != nullptr); + + if(this->gt_sum != nullptr) + return true; + + this->gt_sum = new yon_gt_summary(this->gt->m, this->gt->n_allele); + *this->gt_sum += *this->gt; + if(lazy_evaluate) this->gt_sum->LazyEvaluate(); + return true; + } + + bool EvaluateOcc(){ + assert(this->gt != nullptr); + assert(occ != nullptr); + + this->gt->n_o = occ->occ.size(); + this->gt->n_occ = new uint32_t[this->gt->n_o]; + this->gt->d_occ = new yon_gt_rcd*[this->gt->n_o]; + + for(U32 i = 0; i < this->gt->n_o; ++i){ + this->gt->d_occ[i] = new yon_gt_rcd[this->gt->n_i]; + + uint32_t cum_sum = 0; // Total cumulative genotypes observed. + uint32_t cum_sum_hit = 0; // Number of non-zero runs observed. + uint32_t n_offset = 0; // Virtual offset in destination array. + + // Iterate over available gt rcds. + for(U32 j = 0; j < this->gt->n_i; ++j){ + const uint32_t to = this->occ->occ[i][cum_sum + this->gt->rcds[j].run_length]; + const uint32_t from = this->occ->occ[i][cum_sum]; + if(to - from != 0){ + // Allocate memory for alleles. 
+ this->gt->d_occ[i][n_offset].allele = new uint8_t[this->gt->m]; + + // Copy allelic data from recerence gt rcd. + for(U32 k = 0; k < this->gt->m; ++k){ + this->gt->d_occ[i][n_offset].allele[k] = this->gt->rcds[j].allele[k]; + } + + // Set run-length representation. + this->gt->d_occ[i][n_offset].run_length = to - from; + assert(n_offset < this->gt->n_i); + ++n_offset; + cum_sum_hit += to - from; + } + cum_sum += this->gt->rcds[j].run_length; + } + assert(cum_sum == this->gt->n_s); + assert(cum_sum_hit == this->occ->cum_sums[i]); + this->gt->n_occ[i] = n_offset; + } + return(true); + } + + bool is_dirty; // if data has been modified in the raw buffer but not the containers + bool is_loaded_meta; + bool is_loaded_gt; + uint16_t n_format, n_info, n_filter; + uint32_t id_block; // incremental id in the block container + core::MetaEntry* meta; + yon_gt* gt; + yon_gt_summary* gt_sum; + yon_occ* occ; + containers::PrimitiveContainerInterface** info; + containers::PrimitiveGroupContainerInterface** fmt; + containers::InfoContainerInterface** info_containers; + containers::FormatContainerInterface** format_containers; + containers::GenotypeContainerInterface* gt_i; + std::vector info_hdr; + std::vector format_hdr; + std::vector filter_hdr; + std::vector* info_ids; + std::vector* format_ids; + std::vector* filter_ids; + containers::VariantBlockContainer* parent_container; +}; + +} + +#endif /* CORE_VARIANT_RECORD_H_ */ diff --git a/lib/import.h b/lib/import.h index 965938e..c09b40f 100644 --- a/lib/import.h +++ b/lib/import.h @@ -29,17 +29,17 @@ DEALINGS IN THE SOFTWARE. 
void import_usage(void){ programMessage(); std::cerr << - "Brief: Convert BCF -> YON\n" - "Usage: " << tachyon::constants::PROGRAM_NAME << " import [options] -i -o \n\n" + "Brief: Convert Vcf/Bcf records into a Yon archive.\n" + "Usage: " << tachyon::constants::PROGRAM_NAME << " import [options] -i -o \n\n" "Options:\n" - " -i FILE input BCF file (required)\n" + " -i FILE input Vcf/Bcf/Vcf.gz file (required)\n" " -o FILE output file prefix (required)\n" " -c INT Import checkpoint size in number of variants (default: 1000)\n" " -C FLOAT Import checkpoint size in bases (default: 5 Mb)\n" " -L INT Compression level 1-20 (default: 6)\n" " -t INT Number of compression threads (default: all available)\n" " -p/-P Permute/Do not permute diploid genotypes\n" - " -e Encrypt data (default AES-256)\n" + " -e Encrypt data with AES-256\n" " -d Drop invariant sites (all REF or ALT)\n" " -s Hide all program messages [null]\n"; } diff --git a/lib/index/index.cpp b/lib/index/index.cpp new file mode 100644 index 0000000..c798726 --- /dev/null +++ b/lib/index/index.cpp @@ -0,0 +1,129 @@ +#include + +#include "index.h" + +namespace tachyon{ +namespace index{ + +bool Index::buildMetaIndex(void){ + // This criterion should never be satisfied + if(this->index_.size() == 0) + return false; + + for(U32 c = 0; c < this->index_.size(); ++c){ // foreach contig + if(this->index_.linear_at(c).size() == 0){ + this->index_meta_ += entry_meta_type(); + continue; + } + + entry_meta_type indexindex; + indexindex(this->index_.linear_at(c)[0]); // Start reference + for(U32 i = 1; i < this->index_.linear_at(c).size(); ++i){ + if(indexindex == this->index_.linear_at(c)[i]) // If the blocks share the same contig identifier + indexindex += this->index_.linear_at(c)[i]; + else { // Otherwise push one entry onto the chain and start a new reference + this->index_meta_ += indexindex; + indexindex(this->index_.linear_at(c)[i]); + } + } + + this->index_meta_ += indexindex; + } + + return true; +} + +std::vector 
Index::findOverlap(const U32& contig_id) const{ + if(contig_id > this->getMetaIndex().size()) + return(std::vector()); + + std::vector yon_blocks; + for(U32 i = 0; i < this->getIndex().linear_at(contig_id).size(); ++i){ + yon_blocks.push_back(this->getIndex().linear_at(contig_id).at(i)); + } + + return(yon_blocks); +} + +std::vector Index::findOverlap(const U32& contig_id, const U64& start_pos, const U64& end_pos) const{ + if(contig_id > this->getMetaIndex().size()) + return(std::vector()); + + + if(this->getMetaIndex().at(contig_id).n_blocks == 0){ + return(std::vector()); + } + + const U32 block_offset_start = this->getIndex().linear_at(contig_id).at(0).blockID; + + //std::cerr << "Linear index " << this->getIndex().linear_at(contig_id).size() << std::endl; + //for(U32 i = 0; i < this->getIndex().linear_at(contig_id).size(); ++i){ + // this->getIndex().linear_at(contig_id).at(i).print(std::cerr) << std::endl; + //} + //std::cerr << "block offset: " << block_offset_start << std::endl; + + // We also need to know possible overlaps in the quad-tree: + // Seek from root to origin in quad-tree for potential overlapping bins with counts > 0 + + // Retrieve vector of bins that might contain the data + // The possibleBins function does not check if they exist + std::vector possible_chunks = this->index_[contig_id].possibleBins(start_pos, end_pos); + std::vector yon_blocks; + //std::cerr << "Possible chunks: " << possible_chunks.size() << std::endl; + + // Check if possible bins exists in the linear index + for(U32 i = 0; i < possible_chunks.size(); ++i){ + // Cycle over the YON blocks this bin have data mapping to + for(U32 j = 0; j < possible_chunks[i].size(); ++j){ + const U32 used_bins = possible_chunks[i][j] - block_offset_start; + //std::cerr << i << "/" << possible_chunks[i].size() << ",raw bin: " << possible_chunks[i][j] << ", used bin: " << used_bins << "\t"; + //possible_chunks[i].print(std::cerr) << std::endl; + //std::cerr << "Comparing: " << 
this->getIndex().linear_at(contig_id)[used_bins].minPosition << "<" << end_pos + // << " and " << this->getIndex().linear_at(contig_id)[used_bins].maxPosition << ">" << start_pos << std::endl; + + // Check [a, b] overlaps with [x, y] iff b > x and a < y. + // a = this->getIndex().linear_at(contig_id)[possible_bins[i][j]].minPosition; + // b = this->getIndex().linear_at(contig_id)[possible_bins[i][j]].maxPosition; + // x = start_pos; + // y = end_pos; + if(this->getIndex().linear_at(contig_id)[used_bins].minPosition < end_pos && + this->getIndex().linear_at(contig_id)[used_bins].maxPosition > start_pos) + { + yon_blocks.push_back(used_bins); + //std::cerr << "overlap" << std::endl; + } + //else { + // std::cerr << "no overlap" << std::endl; + //} + } + } + + // Return nothing if all empty + if(yon_blocks.size() == 0) + return(std::vector()); + + // Sort to dedupe + std::sort(yon_blocks.begin(), yon_blocks.end()); + + // Dedupe + std::vector yon_blocks_deduped; + yon_blocks_deduped.push_back(this->getIndex().linear_at(contig_id)[yon_blocks[0]]); + + for(U32 i = 1; i < yon_blocks.size(); ++i){ + if(yon_blocks[i] != yon_blocks_deduped.back().blockID - block_offset_start){ + yon_blocks_deduped.push_back(this->getIndex().linear_at(contig_id)[yon_blocks[i]]); + } + } + + // Debug + //std::cerr << "Final\n" << std::endl; + //for(U32 i = 0; i < yon_blocks_deduped.size(); ++i){ + // yon_blocks_deduped[i].print(std::cerr); + // std::cerr << std::endl; + //} + + return(yon_blocks_deduped); +} + +} +} diff --git a/lib/index/index.h b/lib/index/index.h index 8d5cd5b..c60dc0a 100644 --- a/lib/index/index.h +++ b/lib/index/index.h @@ -12,17 +12,22 @@ namespace index{ class Index{ private: - typedef Index self_type; - typedef std::size_t size_type; - typedef VariantIndex container_type; - typedef IndexMetaContainer container_meta_type; - typedef IndexEntry entry_type; - typedef IndexIndexEntry entry_meta_type; - typedef VariantIndexBin bin_type; + typedef Index self_type; + typedef 
std::size_t size_type; + typedef VariantIndex container_type; + typedef IndexMetaContainer container_meta_type; + typedef IndexEntry entry_type; + typedef IndexIndexEntry entry_meta_type; + typedef VariantIndexBin bin_type; + typedef YonContig contig_type; public: Index() : number_blocks(0){} - Index(const self_type& other) : number_blocks(other.number_blocks), index_meta_(other.index_meta_), index_(other.index_){} + Index(const self_type& other) : + number_blocks(other.number_blocks), + index_meta_(other.index_meta_), + index_(other.index_) + {} ~Index(){} /**< @@ -31,38 +36,20 @@ class Index{ * index entries all belonging to the same contig. * @return Returns TRUE upon success or FALSE otherwise */ - bool buildMetaIndex(void){ - // This criterion should never be satisfied - if(this->index_.size() == 0) - return false; - - for(U32 c = 0; c < this->index_.size(); ++c){ // foreach contig - if(this->index_.linear_at(c).size() == 0){ - this->index_meta_ += entry_meta_type(); - continue; - } - - entry_meta_type indexindex; - indexindex(this->index_.linear_at(c)[0]); // Start reference - for(U32 i = 1; i < this->index_.linear_at(c).size(); ++i){ - if(indexindex == this->index_.linear_at(c)[i]) // If the blocks share the same contig identifier - indexindex += this->index_.linear_at(c)[i]; - else { // Otherwise push one entry onto the chain and start a new reference - this->index_meta_ += indexindex; - indexindex(this->index_.linear_at(c)[i]); - } - } - - this->index_meta_ += indexindex; - } - - return true; - } + bool buildMetaIndex(void); // Capacity - inline const bool empty(void) const{ return(this->index_.empty()); } - const size_t size(void) const{ return(this->index_.size()); } - const size_t sizeMeta(void) const{ return(this->index_meta_.size()); } + inline bool empty(void) const{ return(this->index_.empty()); } + inline size_t size(void) const{ return(this->index_.size()); } + inline size_t sizeMeta(void) const{ return(this->index_meta_.size()); } + + uint64_t 
GetLinearSize(void) const{ + uint64_t n_total = 0; + for(U32 i = 0; i < this->index_.size(); ++i){ + n_total += this->index_.linear_[i].size(); + } + return(n_total); + } //inline void operator+=(const entry_type& entry){ this->index_ += entry; } //inline void operator+=(const entry_meta_type& entry){ this->index_meta_ += entry; } @@ -83,13 +70,7 @@ class Index{ * @param contig_id * @return */ - inline std::vector findOverlap(const U32& contig_id) const{ - if(contig_id > this->getMetaIndex().size()) - return(std::vector()); - - // Todo - return(std::vector()); - } + std::vector findOverlap(const U32& contig_id) const; /**< * Return interval of YON blocks overlapping target tuple (contigID, position, position) @@ -108,67 +89,17 @@ class Index{ * @param end_pos * @return */ - std::vector findOverlap(const U32& contig_id, const U64& start_pos, const U64& end_pos) const{ - if(contig_id > this->getMetaIndex().size()) - return(std::vector()); - - if(this->getMetaIndex().at(contig_id).n_blocks == 0) - return(std::vector()); - - // We also need to know possible overlaps in the quad-tree: - // Seek from root to origin in quad-tree for potential overlapping bins with counts > 0 - - // Retrieve vector of bins that might contain the data - // The possibleBins function does not check if they exist - std::vector possible_bins = this->index_[contig_id].possibleBins(start_pos, end_pos); - std::vector yon_blocks; - - // Check if possible bins exists in the linear index - for(U32 i = 0; i < possible_bins.size(); ++i){ - // Cycle over the YON blocks this bin have data mapping to - for(U32 j = 0; j < possible_bins[i].size(); ++j){ - // Check [a, b] overlaps with [x, y] iff b > x and a < y. 
- // a = this->getIndex().linear_at(contig_id)[possible_bins[i][j]].minPosition; - // b = this->getIndex().linear_at(contig_id)[possible_bins[i][j]].maxPosition; - // x = start_pos; - // y = end_pos; - if(this->getIndex().linear_at(contig_id)[possible_bins[i][j]].minPosition < end_pos && - this->getIndex().linear_at(contig_id)[possible_bins[i][j]].maxPosition > start_pos) - { - yon_blocks.push_back(possible_bins[i][j]); - } - } - } - - // Return nothing if all empty - if(yon_blocks.size() == 0) - return(std::vector()); - - // Sort to dedupe - std::sort(yon_blocks.begin(), yon_blocks.end()); - - // Dedupe - std::vector yon_blocks_deduped; - yon_blocks_deduped.push_back(this->getIndex().linear_at(contig_id)[yon_blocks[0]]); - - for(U32 i = 1; i < yon_blocks.size(); ++i){ - if(yon_blocks[i] != yon_blocks_deduped.back().blockID){ - yon_blocks_deduped.push_back(this->getIndex().linear_at(contig_id)[yon_blocks[i]]); - } - } - - // Debug - //for(U32 i = 0; i < yon_blocks_deduped.size(); ++i){ - // yon_blocks_deduped[i].print(std::cerr); - // std::cerr << std::endl; - //} - - return(yon_blocks_deduped); - } + std::vector findOverlap(const U32& contig_id, const U64& start_pos, const U64& end_pos) const; inline const U64& current_block_number(void) const{ return(this->number_blocks); } inline void operator++(void){ ++this->number_blocks; } + /**< + * Wrapper function for adding a list of contigs to the index + * @param contigs + */ + inline void Add(const std::vector& contigs){ this->index_.Add(contigs); } + private: friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ stream << entry.index_; diff --git a/lib/index/index_meta_container.h b/lib/index/index_meta_container.h index 4bb9d52..61bd423 100644 --- a/lib/index/index_meta_container.h +++ b/lib/index/index_meta_container.h @@ -82,7 +82,7 @@ class IndexMetaContainer{ inline const_reference back(void) const{ return(this->__entries[this->n_entries - 1]); } // Capacity - inline const bool 
empty(void) const{ return(this->n_entries == 0); } + inline bool empty(void) const{ return(this->n_entries == 0); } inline const size_type& size(void) const{ return(this->n_entries); } inline const size_type& capacity(void) const{ return(this->n_capacity); } diff --git a/lib/index/variant_index.cpp b/lib/index/variant_index.cpp new file mode 100644 index 0000000..6de30ad --- /dev/null +++ b/lib/index/variant_index.cpp @@ -0,0 +1,121 @@ +#include "variant_index.h" + +namespace tachyon{ +namespace index{ + +VariantIndex::VariantIndex() : + n_contigs_(0), + n_capacity_(1000), + contigs_(static_cast(::operator new[](this->capacity()*sizeof(value_type)))), + linear_(static_cast(::operator new[](this->capacity()*sizeof(linear_type)))) +{ + +} + +VariantIndex::VariantIndex(const self_type& other) : + n_contigs_(other.n_contigs_), + n_capacity_(other.n_capacity_), + contigs_(static_cast(::operator new[](this->capacity()*sizeof(value_type)))), + linear_(static_cast(::operator new[](this->capacity()*sizeof(linear_type)))) +{ + for(U32 i = 0; i < this->size(); ++i){ + new( &this->contigs_[i] ) value_type( other.contigs_[i] ); + new( &this->linear_[i] ) linear_type( other.linear_[i] ); + } +} + +VariantIndex::~VariantIndex(){ + for(std::size_t i = 0; i < this->size(); ++i){ + (this->contigs_ + i)->~VariantIndexContig(); + (this->linear_ + i)->~VariantIndexLinear(); + } + + ::operator delete[](static_cast(this->contigs_)); + ::operator delete[](static_cast(this->linear_)); +} + +void VariantIndex::resize(void){ + pointer temp = this->contigs_; + linear_type* temp_linear = this->linear_; + + this->n_capacity_ *= 2; + this->contigs_ = static_cast(::operator new[](this->capacity()*sizeof(value_type))); + this->linear_ = static_cast(::operator new[](this->capacity()*sizeof(linear_type))); + + + // Lift over values from old addresses + for(U32 i = 0; i < this->size(); ++i){ + new( &this->contigs_[i] ) value_type( temp[i] ); + new( &this->linear_[i] ) linear_type( temp_linear[i] ); + 
} + + // Clear temp + for(std::size_t i = 0; i < this->size(); ++i){ + (temp + i)->~VariantIndexContig(); + (temp_linear + i)->~VariantIndexLinear(); + } + + ::operator delete[](static_cast(temp)); + ::operator delete[](static_cast(temp_linear)); +} + +std::ostream& operator<<(std::ostream& stream, const VariantIndex& index){ + stream.write(reinterpret_cast(&index.n_contigs_), sizeof(std::size_t)); + std::size_t n_items_written = 0; + for(U32 i = 0; i < index.size(); ++i){ + if(index.contigs_[i].size_sites()) ++n_items_written; + } + stream.write(reinterpret_cast(&n_items_written), sizeof(std::size_t)); + + for(U32 i = 0; i < index.size(); ++i){ + // Write if contig[i] contains data + if(index.contigs_[i].size_sites()) + stream << index.contigs_[i]; + } + + // Write linear index + for(U32 i = 0; i < index.size(); ++i) stream << index.linear_[i]; + + return(stream); +} + +std::istream& operator>>(std::istream& stream, VariantIndex& index){ + // Clear old data + if(index.size()){ + for(std::size_t i = 0; i < index.size(); ++i){ + (index.contigs_ + i)->~VariantIndexContig(); + (index.linear_ + i)->~VariantIndexLinear(); + } + + ::operator delete[](static_cast(index.contigs_)); + ::operator delete[](static_cast(index.linear_)); + } + + stream.read(reinterpret_cast(&index.n_contigs_), sizeof(std::size_t)); + index.n_capacity_ = index.size() + 64; + std::size_t n_items_written = 0; + stream.read(reinterpret_cast(&n_items_written), sizeof(std::size_t)); + + // Allocate new data + index.contigs_ = static_cast(::operator new[](index.capacity()*sizeof(VariantIndexContig))); + index.linear_ = static_cast(::operator new[](index.capacity()*sizeof(VariantIndexLinear))); + for(U32 i = 0; i < index.size(); ++i) { + new( &index.contigs_[i] ) VariantIndexContig( ); + new( &index.linear_[i] ) VariantIndexLinear( ); + } + + // Load data and update accordingly + for(U32 i = 0; i < n_items_written; ++i){ + VariantIndexContig temp; + stream >> temp; // Read + index.at(temp.getContigID()) 
= temp; + } + + // Load linear index + for(U32 i = 0; i < index.size(); ++i) stream >> index.linear_[i]; + + return(stream); +} + +} +} diff --git a/lib/index/variant_index.h b/lib/index/variant_index.h index 217224c..c56296b 100644 --- a/lib/index/variant_index.h +++ b/lib/index/variant_index.h @@ -3,510 +3,17 @@ #include #include +#include +#include "variant_index_bin.h" +#include "variant_index_contig.h" #include "variant_index_linear.h" +#include "core/header/variant_header.h" +#include "io/vcf_utils.h" namespace tachyon{ namespace index{ -struct VariantIndexBin{ -private: - typedef VariantIndexBin self_type; - typedef std::size_t size_type; - typedef U32 value_type; - typedef value_type& reference; - typedef const value_type& const_reference; - typedef value_type* pointer; - typedef const value_type* const_pointer; - -public: - VariantIndexBin() : - binID_(0), - n_variants_(0), - n_blocks_(0), - n_capacity_(100), - blocks_(new value_type[this->capacity()]) - { - - } - - VariantIndexBin(const self_type& other) : - binID_(other.binID_), - n_variants_(other.n_variants_), - n_blocks_(other.n_blocks_), - n_capacity_(other.n_capacity_), - blocks_(new value_type[this->capacity()]) - { - memcpy(this->blocks_, other.blocks_, sizeof(value_type)*other.n_blocks_); - } - - - VariantIndexBin& operator=(const self_type& other){ - delete [] this->blocks_; - this->blocks_ = new value_type[other.capacity()]; - this->binID_ = other.binID_; - this->n_blocks_ = other.n_blocks_; - this->n_capacity_ = other.n_capacity_; - for(U32 i = 0; i < this->size(); ++i) this->blocks_[i] = other.blocks_[i]; - - return(*this); - } - - inline bool operator<(const self_type& other) const{ return(this->binID_ < other.binID_); } - - ~VariantIndexBin(){ delete [] this->blocks_; } - - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int 
junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - // Element access - inline reference at(const size_type& position){ return(this->blocks_[position]); } - inline const_reference at(const size_type& position) const{ return(this->blocks_[position]); } - inline reference operator[](const size_type& position){ return(this->blocks_[position]); } - inline const_reference operator[](const size_type& position) const{ return(this->blocks_[position]); } - inline pointer data(void){ return(this->blocks_); } - inline const_pointer data(void) const{ return(this->blocks_); } - inline reference front(void){ return(this->blocks_[0]); } - inline const_reference front(void) const{ return(this->blocks_[0]); } - inline reference back(void){ return(this->blocks_[this->n_blocks_ - 1]); } - inline const_reference back(void) const{ return(this->blocks_[this->n_blocks_ - 1]); } - - // Capacity - inline const bool empty(void) const{ return(this->n_blocks_ == 0); } - inline const size_type& size(void) const{ return(this->n_blocks_); } - inline const size_type& capacity(void) const{ return(this->n_capacity_); } - - // Iterator - inline iterator begin(){ return iterator(&this->blocks_[0]); } - inline iterator end(){ return 
iterator(&this->blocks_[this->n_blocks_]); } - inline const_iterator begin() const{ return const_iterator(&this->blocks_[0]); } - inline const_iterator end() const{ return const_iterator(&this->blocks_[this->n_blocks_]); } - inline const_iterator cbegin() const{ return const_iterator(&this->blocks_[0]); } - inline const_iterator cend() const{ return const_iterator(&this->blocks_[this->n_blocks_]); } - - // resize - void resize(){ - pointer old = this->blocks_; - this->n_capacity_ *= 2; - this->blocks_ = new value_type[this->capacity()*2]; - for(U32 i = 0; i < this->size(); ++i) this->blocks_[i] = old[i]; - delete [] old; - } - - /**< - * Update - * @param variant_block_number - */ - void Add(const U32& variant_block_number){ - if(this->size() + 1 >= this->capacity()) - this->resize(); - - if(this->size()){ // Has data - if(this->back() != variant_block_number) // check parity between previous tachyon block and current one - this->blocks_[this->n_blocks_++] = variant_block_number; - - ++this->n_variants_; - } else { // Empty - this->blocks_[this->n_blocks_++] = variant_block_number; - ++this->n_variants_; - } - } - -private: - friend std::ostream& operator<<(std::ostream& stream, const self_type& bin){ - stream.write(reinterpret_cast(&bin.binID_), sizeof(U32)); - stream.write(reinterpret_cast(&bin.n_variants_), sizeof(U32)); - stream.write(reinterpret_cast(&bin.n_blocks_), sizeof(size_type)); - for(U32 i = 0; i < bin.size(); ++i) - stream.write(reinterpret_cast(&bin.blocks_[i]), sizeof(value_type)); - - return(stream); - } - - friend std::istream& operator>>(std::istream& stream, self_type& bin){ - delete [] bin.blocks_; - stream.read(reinterpret_cast(&bin.binID_), sizeof(U32)); - stream.read(reinterpret_cast(&bin.n_variants_), sizeof(U32)); - stream.read(reinterpret_cast(&bin.n_blocks_), sizeof(size_type)); - bin.n_capacity_ = bin.size() + 64; - bin.blocks_ = new value_type[bin.capacity()]; - - for(U32 i = 0; i < bin.size(); ++i) - 
stream.read(reinterpret_cast(&bin.blocks_[i]), sizeof(value_type)); - - return(stream); - } - -public: - U32 binID_; - U32 n_variants_; // number of variants belonging to this bin - size_type n_blocks_; - size_type n_capacity_; - pointer blocks_; // tachyon blocks belonging to this bin -}; - -class VariantIndexContig{ -private: - typedef VariantIndexContig self_type; - typedef std::size_t size_type; - typedef VariantIndexBin value_type; - typedef value_type& reference; - typedef const value_type& const_reference; - typedef value_type* pointer; - typedef const value_type* const_pointer; - -public: - VariantIndexContig() : - contigID_(0), - l_contig_(0), - l_contig_rounded_(0), - n_bins_(0), - n_capacity_(0), - n_levels_(0), - n_sites_(0), - bins_cumsum_(nullptr), - bins_(nullptr) - { - - } - - VariantIndexContig(const U32 contigID, const U64 l_contig, const BYTE n_levels) : - contigID_(contigID), - l_contig_(l_contig), - l_contig_rounded_(0), - n_bins_(0), - n_capacity_(0), - n_levels_(n_levels), - n_sites_(0), - bins_cumsum_(nullptr), - bins_(nullptr) - { - this->l_contig_rounded_ = this->roundLengthClosestBase4_(this->l_contig_); - if(this->n_levels_ != 0){ - this->calculateCumulativeSums_(); - this->n_capacity_ = this->bins_cumsum_[this->n_levels_] + 64; - this->n_bins_ = this->bins_cumsum_[this->n_levels_]; - this->bins_ = static_cast(::operator new[](this->capacity()*sizeof(value_type))); - for(U32 i = 0; i < this->size(); ++i){ - new( &this->bins_[i] ) value_type( ); - this->bins_[i].binID_ = i; - } - } - } - - VariantIndexContig(const self_type& other) : - contigID_(other.contigID_), - l_contig_(other.l_contig_), - l_contig_rounded_(other.l_contig_rounded_), - n_bins_(other.n_bins_), - n_capacity_(other.n_capacity_), - n_levels_(other.n_levels_), - n_sites_(other.n_sites_), - bins_cumsum_(nullptr), - bins_(static_cast(::operator new[](this->capacity()*sizeof(value_type)))) - { - this->calculateCumulativeSums_(); - for(U32 i = 0; i < this->size(); ++i) - new( 
&this->bins_[i] ) value_type( other.bins_[i] ); - } - - void operator=(const self_type& other){ - // Clean previous - for(std::size_t i = 0; i < this->size(); ++i) - (this->bins_ + i)->~VariantIndexBin(); - - ::operator delete[](static_cast(this->bins_)); - delete [] this->bins_cumsum_; - - this->contigID_ = other.contigID_; - this->l_contig_ = other.l_contig_; - this->l_contig_rounded_ = other.l_contig_rounded_; - this->n_bins_ = other.n_bins_; - this->n_capacity_ = other.n_capacity_; - this->n_levels_ = other.n_levels_; - this->bins_cumsum_ = nullptr; - this->calculateCumulativeSums_(); - - this->bins_ = static_cast(::operator new[](this->capacity()*sizeof(value_type))); - for(U32 i = 0; i < this->size(); ++i) - new( &this->bins_[i] ) value_type( other.bins_[i] ); - } - - - ~VariantIndexContig(){ - for(std::size_t i = 0; i < this->size(); ++i) - (this->bins_ + i)->~VariantIndexBin(); - - ::operator delete[](static_cast(this->bins_)); - } - - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer 
ptr_; - }; - - // Element access - inline reference at(const size_type& position){ return(this->bins_[position]); } - inline const_reference at(const size_type& position) const{ return(this->bins_[position]); } - inline reference operator[](const size_type& position){ return(this->bins_[position]); } - inline const_reference operator[](const size_type& position) const{ return(this->bins_[position]); } - inline pointer data(void){ return(this->bins_); } - inline const_pointer data(void) const{ return(this->bins_); } - inline reference front(void){ return(this->bins_[0]); } - inline const_reference front(void) const{ return(this->bins_[0]); } - inline reference back(void){ return(this->bins_[this->n_bins_ - 1]); } - inline const_reference back(void) const{ return(this->bins_[this->n_bins_ - 1]); } - - // Capacity - inline const bool empty(void) const{ return(this->n_bins_ == 0); } - inline const size_type& size(void) const{ return(this->n_bins_); } - inline const size_type& capacity(void) const{ return(this->n_capacity_); } - inline const size_type& size_sites(void) const{ return(this->n_sites_); } - - // Iterator - inline iterator begin(){ return iterator(&this->bins_[0]); } - inline iterator end(){ return iterator(&this->bins_[this->n_bins_]); } - inline const_iterator begin() const{ return const_iterator(&this->bins_[0]); } - inline const_iterator end() const{ return const_iterator(&this->bins_[this->n_bins_]); } - inline const_iterator cbegin() const{ return const_iterator(&this->bins_[0]); } - inline const_iterator cend() const{ return const_iterator(&this->bins_[this->n_bins_]); } - - // Accessor - inline U32& getContigID(void){ return(this->contigID_); } - inline const U32& getContigID(void) const{ return(this->contigID_); } - - /**< - * Add a target interval tuple (from,to,block_ID) - * @param fromPosition From position of interval - * @param toPosition To position of interval - * @param yon_block_id Tachyon block ID (generally a cumulative integer) - * 
@return - */ - inline const S32 Add(const U64& fromPosition, const U64& toPosition, const U32& yon_block_id){ - for(S32 i = this->n_levels_; i != 0; --i){ - U32 binFrom = S64(fromPosition/(this->l_contig_rounded_ / pow(4,i))); - U32 binTo = S64(toPosition/(this->l_contig_rounded_ / pow(4,i))); - /** - * If both ends of the interval map into the same chunk we know the interval is - * completely contained: in this case we deposit the interval there - **/ - if(binFrom == binTo){ - //if(i != this->n_levels_) std::cerr << fromPosition << "->" << toPosition << ", adding to " << binFrom << " level " << i << " cum : " << this->bins_cumsum_[i-1]+binFrom << "/" << this->size() << std::endl; - ++this->n_sites_; - this->bins_[this->bins_cumsum_[i - 1]+binFrom].Add(yon_block_id); - return(this->bins_cumsum_[i - 1]+binFrom); - } - } - this->bins_[0].Add(yon_block_id); - ++this->n_sites_; - return(0); - } - - /**< - * Computes the possible bins an interval might overlap - * @param from_position From position of interval - * @param to_position To position of interval - * @return Returns a vector of viable overlapping bins - */ - std::vector possibleBins(const U64& from_position, const U64& to_position, const bool filter = true) const{ - std::vector overlapping_chunks; - //overlapping_chunks.push_back(this->at(0)); // level 0 - - // If end position are out-of-bounds then trucated it to maximum - // allowed value - U64 used_to_posititon = to_position; - if(used_to_posititon > this->l_contig_rounded_){ - //std::cerr << "out of bounds" << std::endl; - //std::cerr << to_position << "->" << this->l_contig_rounded_ << std::endl; - used_to_posititon = this->l_contig_rounded_; - } - - for(S32 i = this->n_levels_; i != 0; --i){ - S64 binFrom = S64(from_position/(this->l_contig_rounded_ / pow(4,i))); - S64 binTo = S64(used_to_posititon/(this->l_contig_rounded_ / pow(4,i))); - - //std::cerr << i << "/" << (int)this->n_levels_ << ": " << this->bins_cumsum_[i-1] << " + " << binFrom << " -> " << 
binTo << "/" << this->size() << std::endl; - //std::cerr << "limit: " << this->bins_cumsum_[i] << std::endl; - - // Overlap from cumpos + (binFrom, binTo) - // All these chunks could potentially hold intervals overlapping - // the desired coordinates - for(U32 j = binFrom; j <= binTo; ++j){ - if(filter == false) - overlapping_chunks.push_back(this->at(this->bins_cumsum_[i - 1] + j)); - else { - if(this->at(this->bins_cumsum_[i - 1] + j).size()) - overlapping_chunks.push_back(this->at(this->bins_cumsum_[i - 1] + j)); - } - } - } - overlapping_chunks.push_back(this->at(0)); - - return(overlapping_chunks); - } - -private: - /**< - * Round target integer up to the closest number divisible by 4 - * @param length Input integer start value - * @return Return a target integer divisible by 4 - */ - inline U64 roundLengthClosestBase4_(const U64& length) const{ - return( ( pow(4,this->n_levels_) - (length % (U64)pow(4,this->n_levels_)) ) + length ); - } - - /**< - * Pre-calculate the cumulative distribution of 4^(0:levels-1). 
- * These values are used to find the array offset for levels > 0 - */ - void calculateCumulativeSums_(void){ - if(this->n_levels_ == 0) return; - - delete [] this->bins_cumsum_; - this->bins_cumsum_ = new U32[this->n_levels_ + 1]; // inclusive last - - U32 total = 0; - for(U32 i = 0; i <= this->n_levels_; ++i){ - total += pow(4,i); - this->bins_cumsum_[i] = total - 1; // remove 0 to start relative zero - } - } - - friend std::ostream& operator<<(std::ostream& stream, const self_type& contig){ - stream.write(reinterpret_cast(&contig.contigID_), sizeof(U32)); - stream.write(reinterpret_cast(&contig.l_contig_), sizeof(U64)); - stream.write(reinterpret_cast(&contig.l_contig_rounded_), sizeof(U64)); - stream.write(reinterpret_cast(&contig.n_bins_), sizeof(size_type)); - stream.write(reinterpret_cast(&contig.n_levels_), sizeof(BYTE)); - stream.write(reinterpret_cast(&contig.n_sites_), sizeof(size_type)); - - size_type n_items_written = 0; - for(U32 i = 0; i < contig.size(); ++i){ - if(contig.bins_[i].size()) ++n_items_written; - } - stream.write(reinterpret_cast(&n_items_written), sizeof(size_type)); - - for(U32 i = 0; i < contig.size(); ++i){ - // If bins[i] contains data - if(contig.bins_[i].size()) - stream << contig.bins_[i]; - } - return(stream); - } - - friend std::istream& operator>>(std::istream& stream, self_type& contig){ - // Clear old data - if(contig.size()){ - for(std::size_t i = 0; i < contig.size(); ++i) - (contig.bins_ + i)->~VariantIndexBin(); - - ::operator delete[](static_cast(contig.bins_)); - } - - delete [] contig.bins_cumsum_; - contig.bins_cumsum_ = nullptr; - - stream.read(reinterpret_cast(&contig.contigID_), sizeof(U32)); - stream.read(reinterpret_cast(&contig.l_contig_), sizeof(U64)); - stream.read(reinterpret_cast(&contig.l_contig_rounded_), sizeof(U64)); - stream.read(reinterpret_cast(&contig.n_bins_), sizeof(size_type)); - stream.read(reinterpret_cast(&contig.n_levels_), sizeof(BYTE)); - stream.read(reinterpret_cast(&contig.n_sites_), 
sizeof(size_type)); - contig.n_capacity_ = contig.n_bins_ + 64; - size_type n_items_written = 0; - stream.read(reinterpret_cast(&n_items_written), sizeof(size_type)); - - // Allocate new - contig.bins_ = static_cast(::operator new[](contig.capacity()*sizeof(value_type))); - for(U32 i = 0; i < contig.size(); ++i){ - new( &contig.bins_[i] ) value_type( ); - contig.bins_[i].binID_ = i; - } - contig.calculateCumulativeSums_(); - - // Load data accordingly - for(U32 i = 0; i < n_items_written; ++i){ - value_type temp; - stream >> temp; - //std::cerr << "loading: " << temp.size() << " entries" << std::endl; - contig.bins_[temp.binID_] = temp; - } - //std::cerr << std::endl; - return(stream); - } - -private: - U32 contigID_; - U64 l_contig_; // as described in header - U64 l_contig_rounded_; // rounded up to next base-4 - size_type n_bins_; - size_type n_capacity_; - BYTE n_levels_; // 7 by default - size_type n_sites_; - U32* bins_cumsum_; // 1, 1+4, 1+4+16, 1+4+16+64, ... - pointer bins_; // bin information -}; - class VariantIndex{ private: typedef VariantIndex self_type; @@ -518,38 +25,12 @@ class VariantIndex{ typedef value_type* pointer; typedef const value_type* const_pointer; typedef IndexEntry linear_entry_type; + typedef YonContig contig_type; public: - VariantIndex() : - n_contigs_(0), - n_capacity_(1000), - contigs_(static_cast(::operator new[](this->capacity()*sizeof(value_type)))), - linear_(static_cast(::operator new[](this->capacity()*sizeof(linear_type)))) - { - - } - - VariantIndex(const self_type& other) : - n_contigs_(other.n_contigs_), - n_capacity_(other.n_capacity_), - contigs_(static_cast(::operator new[](this->capacity()*sizeof(value_type)))), - linear_(static_cast(::operator new[](this->capacity()*sizeof(linear_type)))) - { - for(U32 i = 0; i < this->size(); ++i){ - new( &this->contigs_[i] ) value_type( other.contigs_[i] ); - new( &this->linear_[i] ) linear_type( other.linear_[i] ); - } - } - - ~VariantIndex(){ - for(std::size_t i = 0; i < 
this->size(); ++i){ - (this->contigs_ + i)->~VariantIndexContig(); - (this->linear_ + i)->~VariantIndexLinear(); - } - - ::operator delete[](static_cast(this->contigs_)); - ::operator delete[](static_cast(this->linear_)); - } + VariantIndex(); + VariantIndex(const self_type& other); + ~VariantIndex();; class iterator{ private: @@ -601,7 +82,7 @@ class VariantIndex{ inline const linear_type& linear_at(const size_type& contig_id) const{ return(this->linear_[contig_id]); } // Capacity - inline const bool empty(void) const{ return(this->n_contigs_ == 0); } + inline bool empty(void) const{ return(this->n_contigs_ == 0); } inline const size_type& size(void) const{ return(this->n_contigs_); } inline const size_type& capacity(void) const{ return(this->n_capacity_); } @@ -613,13 +94,40 @@ class VariantIndex{ inline const_iterator cbegin() const{ return const_iterator(&this->contigs_[0]); } inline const_iterator cend() const{ return const_iterator(&this->contigs_[this->n_contigs_]); } + self_type& Add(const std::vector& contigs){ + while(this->size() + contigs.size() + 1 >= this->n_capacity_) + this->resize(); + + for(U32 i = 0; i < contigs.size(); ++i){ + const U64 contig_length = contigs[i].n_bases; + BYTE n_levels = 7; + U64 bins_lowest = pow(4,n_levels); + double used = ( bins_lowest - (contig_length % bins_lowest) ) + contig_length; + + if(used / bins_lowest < 2500){ + for(S32 i = n_levels; i != 0; --i){ + if(used/pow(4,i) > 2500){ + n_levels = i; + break; + } + } + } + + this->Add(i, contig_length, n_levels); + //std::cerr << "contig: " << this->header->contigs[i].name << "(" << i << ")" << " -> " << contig_length << " levels: " << (int)n_levels << std::endl; + //std::cerr << "idx size:" << idx.size() << " at " << this->writer->index.variant_index_[i].size() << std::endl; + //std::cerr << i << "->" << this->header->contigs[i].name << ":" << contig_length << " up to " << (U64)used << " width (bp) lowest level: " << used/pow(4,n_levels) << "@level: " << (int)n_levels << 
std::endl; + } + return(*this); + } + /**< * Add a contig with n_levels to the chain * @param l_contig Length of contig * @param n_levels Number of desired 4^N levels * @return Returns a reference of self */ - inline self_type& add(const U32& contigID, const U64& l_contig, const BYTE& n_levels){ + inline self_type& Add(const U32& contigID, const U64& l_contig, const BYTE& n_levels){ if(this->size() + 1 >= this->n_capacity_) this->resize(); @@ -635,7 +143,7 @@ class VariantIndex{ * @param entry Target index entry to push back onto the linear index vector * @return Returns a reference of self */ - inline self_type& add(const U32& contigID, const linear_entry_type& entry){ + inline self_type& Add(const U32& contigID, const linear_entry_type& entry){ this->linear_[contigID] += entry; return(*this); } @@ -644,91 +152,13 @@ class VariantIndex{ * Resizes the index to accept more contigs than currently allocated * memory for. Resizes for the quad-tree index and the linear index */ - void resize(void){ - pointer temp = this->contigs_; - linear_type* temp_linear = this->linear_; - - this->n_capacity_ *= 2; - this->contigs_ = static_cast(::operator new[](this->capacity()*sizeof(value_type))); - this->linear_ = static_cast(::operator new[](this->capacity()*sizeof(linear_type))); - - - // Lift over values from old addresses - for(U32 i = 0; i < this->size(); ++i){ - new( &this->contigs_[i] ) value_type( temp[i] ); - new( &this->linear_[i] ) linear_type( temp_linear[i] ); - } - - // Clear temp - for(std::size_t i = 0; i < this->size(); ++i){ - (temp + i)->~VariantIndexContig(); - (temp_linear + i)->~VariantIndexLinear(); - } - - ::operator delete[](static_cast(temp)); - ::operator delete[](static_cast(temp_linear)); - } + void resize(void); private: - friend std::ostream& operator<<(std::ostream& stream, const self_type& index){ - stream.write(reinterpret_cast(&index.n_contigs_), sizeof(size_type)); - size_type n_items_written = 0; - for(U32 i = 0; i < index.size(); ++i){ - 
if(index.contigs_[i].size_sites()) ++n_items_written; - } - stream.write(reinterpret_cast(&n_items_written), sizeof(size_type)); - - for(U32 i = 0; i < index.size(); ++i){ - // Write if contig[i] contains data - if(index.contigs_[i].size_sites()) - stream << index.contigs_[i]; - } - - // Write linear index - for(U32 i = 0; i < index.size(); ++i) stream << index.linear_[i]; - - return(stream); - } + friend std::ostream& operator<<(std::ostream& stream, const self_type& index); + friend std::istream& operator>>(std::istream& stream, self_type& index); - friend std::istream& operator>>(std::istream& stream, self_type& index){ - // Clear old data - if(index.size()){ - for(std::size_t i = 0; i < index.size(); ++i){ - (index.contigs_ + i)->~VariantIndexContig(); - (index.linear_ + i)->~VariantIndexLinear(); - } - - ::operator delete[](static_cast(index.contigs_)); - ::operator delete[](static_cast(index.linear_)); - } - - stream.read(reinterpret_cast(&index.n_contigs_), sizeof(size_type)); - index.n_capacity_ = index.size() + 64; - size_type n_items_written = 0; - stream.read(reinterpret_cast(&n_items_written), sizeof(size_type)); - - // Allocate new data - index.contigs_ = static_cast(::operator new[](index.capacity()*sizeof(value_type))); - index.linear_ = static_cast(::operator new[](index.capacity()*sizeof(linear_type))); - for(U32 i = 0; i < index.size(); ++i) { - new( &index.contigs_[i] ) value_type( ); - new( &index.linear_[i] ) linear_type( ); - } - - // Load data and update accordingly - for(U32 i = 0; i < n_items_written; ++i){ - value_type temp; - stream >> temp; // Read - index.at(temp.getContigID()) = temp; - } - - // Load linear index - for(U32 i = 0; i < index.size(); ++i) stream >> index.linear_[i]; - - return(stream); - } - -private: +public: size_type n_contigs_; // number of contigs size_type n_capacity_; pointer contigs_; diff --git a/lib/index/variant_index_bin.h b/lib/index/variant_index_bin.h new file mode 100644 index 0000000..61e45e2 --- 
/dev/null +++ b/lib/index/variant_index_bin.h @@ -0,0 +1,195 @@ +#ifndef INDEX_VARIANT_INDEX_BIN_H_ +#define INDEX_VARIANT_INDEX_BIN_H_ + +#include +#include + +#include "support/type_definitions.h" + +namespace tachyon{ +namespace index{ + +struct VariantIndexBin{ +private: + typedef VariantIndexBin self_type; + typedef std::size_t size_type; + typedef U32 value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + +public: + VariantIndexBin() : + binID_(0), + n_variants_(0), + n_blocks_(0), + n_capacity_(100), + blocks_(new value_type[this->capacity()]) + { + + } + + VariantIndexBin(const self_type& other) : + binID_(other.binID_), + n_variants_(other.n_variants_), + n_blocks_(other.n_blocks_), + n_capacity_(other.n_capacity_), + blocks_(new value_type[this->capacity()]) + { + memcpy(this->blocks_, other.blocks_, sizeof(value_type)*other.n_blocks_); + } + + + VariantIndexBin& operator=(const self_type& other){ + delete [] this->blocks_; + this->blocks_ = new value_type[other.capacity()]; + this->binID_ = other.binID_; + this->n_blocks_ = other.n_blocks_; + this->n_capacity_ = other.n_capacity_; + for(U32 i = 0; i < this->size(); ++i) this->blocks_[i] = other.blocks_[i]; + + return(*this); + } + + inline bool operator<(const self_type& other) const{ return(this->binID_ < other.binID_); } + + ~VariantIndexBin(){ delete [] this->blocks_; } + + class iterator{ + private: + typedef iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + reference operator*() const{ return *ptr_; } + pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + class 
const_iterator{ + private: + typedef const_iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + const_iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + const_reference operator*() const{ return *ptr_; } + const_pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + // Element access + inline reference at(const size_type& position){ return(this->blocks_[position]); } + inline const_reference at(const size_type& position) const{ return(this->blocks_[position]); } + inline reference operator[](const size_type& position){ return(this->blocks_[position]); } + inline const_reference operator[](const size_type& position) const{ return(this->blocks_[position]); } + inline pointer data(void){ return(this->blocks_); } + inline const_pointer data(void) const{ return(this->blocks_); } + inline reference front(void){ return(this->blocks_[0]); } + inline const_reference front(void) const{ return(this->blocks_[0]); } + inline reference back(void){ return(this->blocks_[this->n_blocks_ - 1]); } + inline const_reference back(void) const{ return(this->blocks_[this->n_blocks_ - 1]); } + + // Capacity + inline bool empty(void) const{ return(this->n_blocks_ == 0); } + inline const size_type& size(void) const{ return(this->n_blocks_); } + inline const size_type& capacity(void) const{ return(this->n_capacity_); } + + // Iterator + inline iterator begin(){ return iterator(&this->blocks_[0]); } + inline iterator end(){ return iterator(&this->blocks_[this->n_blocks_]); } + inline const_iterator begin() const{ return const_iterator(&this->blocks_[0]); } + inline const_iterator end() const{ return const_iterator(&this->blocks_[this->n_blocks_]); } + inline const_iterator cbegin() const{ return const_iterator(&this->blocks_[0]); 
} + inline const_iterator cend() const{ return const_iterator(&this->blocks_[this->n_blocks_]); } + + // resize + void resize(){ + pointer old = this->blocks_; + this->n_capacity_ *= 2; + this->blocks_ = new value_type[this->capacity()*2]; + for(U32 i = 0; i < this->size(); ++i) this->blocks_[i] = old[i]; + delete [] old; + } + + /**< + * Update + * @param variant_block_number + */ + void add(const U32& variant_block_number){ + if(this->size() + 1 >= this->capacity()) + this->resize(); + + if(this->size()){ // Has data + if(this->back() != variant_block_number) // check parity between previous tachyon block and current one + this->blocks_[this->n_blocks_++] = variant_block_number; + + ++this->n_variants_; + } else { // Empty + this->blocks_[this->n_blocks_++] = variant_block_number; + ++this->n_variants_; + } + } + + std::ostream& print(std::ostream& stream){ + stream << "ID: " << this->binID_ << ", variants: " << this->n_variants_ << ", associated blocks: " << this->n_blocks_; + if(this->size()){ + stream << ", yon-blocks ids: " << this->blocks_[0]; + for(U32 i = 1; i < this->size(); ++i) + stream << ',' << this->blocks_[i]; + } + + return(stream); + } + +private: + friend std::ostream& operator<<(std::ostream& stream, const self_type& bin){ + stream.write(reinterpret_cast(&bin.binID_), sizeof(U32)); + stream.write(reinterpret_cast(&bin.n_variants_), sizeof(U32)); + stream.write(reinterpret_cast(&bin.n_blocks_), sizeof(size_type)); + for(U32 i = 0; i < bin.size(); ++i) + stream.write(reinterpret_cast(&bin.blocks_[i]), sizeof(value_type)); + + return(stream); + } + + friend std::istream& operator>>(std::istream& stream, self_type& bin){ + delete [] bin.blocks_; + stream.read(reinterpret_cast(&bin.binID_), sizeof(U32)); + stream.read(reinterpret_cast(&bin.n_variants_), sizeof(U32)); + stream.read(reinterpret_cast(&bin.n_blocks_), sizeof(size_type)); + bin.n_capacity_ = bin.size() + 64; + bin.blocks_ = new value_type[bin.capacity()]; + + for(U32 i = 0; i < 
bin.size(); ++i) + stream.read(reinterpret_cast(&bin.blocks_[i]), sizeof(value_type)); + + return(stream); + } + +public: + U32 binID_; + U32 n_variants_; // number of variants belonging to this bin + size_type n_blocks_; + size_type n_capacity_; + pointer blocks_; // tachyon blocks belonging to this bin +}; + +} +} + + + +#endif /* INDEX_VARIANT_INDEX_BIN_H_ */ diff --git a/lib/index/variant_index_contig.cpp b/lib/index/variant_index_contig.cpp new file mode 100644 index 0000000..f410ebb --- /dev/null +++ b/lib/index/variant_index_contig.cpp @@ -0,0 +1,151 @@ +#include + +#include "variant_index_contig.h" + +namespace tachyon{ +namespace index{ + +VariantIndexContig::VariantIndexContig() : + contigID_(0), + l_contig_(0), + l_contig_rounded_(0), + n_bins_(0), + n_capacity_(0), + n_levels_(0), + n_sites_(0), + bins_cumsum_(nullptr), + bins_(nullptr) +{ + +} + +VariantIndexContig::VariantIndexContig(const U32 contigID, const U64 l_contig, const BYTE n_levels) : + contigID_(contigID), + l_contig_(l_contig), + l_contig_rounded_(0), + n_bins_(0), + n_capacity_(0), + n_levels_(n_levels), + n_sites_(0), + bins_cumsum_(nullptr), + bins_(nullptr) +{ + this->l_contig_rounded_ = this->roundLengthClosestBase4_(this->l_contig_); + if(this->n_levels_ != 0){ + this->calculateCumulativeSums_(); + this->n_capacity_ = this->bins_cumsum_[this->n_levels_] + 64; + this->n_bins_ = this->bins_cumsum_[this->n_levels_]; + this->bins_ = static_cast(::operator new[](this->capacity()*sizeof(value_type))); + for(U32 i = 0; i < this->size(); ++i){ + new( &this->bins_[i] ) value_type( ); + this->bins_[i].binID_ = i; + } + } +} + +VariantIndexContig::VariantIndexContig(const self_type& other) : + contigID_(other.contigID_), + l_contig_(other.l_contig_), + l_contig_rounded_(other.l_contig_rounded_), + n_bins_(other.n_bins_), + n_capacity_(other.n_capacity_), + n_levels_(other.n_levels_), + n_sites_(other.n_sites_), + bins_cumsum_(nullptr), + bins_(static_cast(::operator 
new[](this->capacity()*sizeof(value_type)))) +{ + this->calculateCumulativeSums_(); + for(U32 i = 0; i < this->size(); ++i) + new( &this->bins_[i] ) value_type( other.bins_[i] ); +} + +void VariantIndexContig::operator=(const self_type& other){ + // Clean previous + for(std::size_t i = 0; i < this->size(); ++i) + (this->bins_ + i)->~VariantIndexBin(); + + ::operator delete[](static_cast(this->bins_)); + delete [] this->bins_cumsum_; + + this->contigID_ = other.contigID_; + this->l_contig_ = other.l_contig_; + this->l_contig_rounded_ = other.l_contig_rounded_; + this->n_bins_ = other.n_bins_; + this->n_capacity_ = other.n_capacity_; + this->n_levels_ = other.n_levels_; + this->bins_cumsum_ = nullptr; + this->calculateCumulativeSums_(); + + this->bins_ = static_cast(::operator new[](this->capacity()*sizeof(value_type))); + for(U32 i = 0; i < this->size(); ++i) + new( &this->bins_[i] ) value_type( other.bins_[i] ); +} + + +VariantIndexContig::~VariantIndexContig(){ + for(std::size_t i = 0; i < this->size(); ++i) + (this->bins_ + i)->~VariantIndexBin(); + + ::operator delete[](static_cast(this->bins_)); +} + +S32 VariantIndexContig::add(const U64& fromPosition, const U64& toPosition, const U32& yon_block_id){ + for(S32 i = this->n_levels_; i != 0; --i){ + U32 binFrom = S64(fromPosition/(this->l_contig_rounded_ / pow(4,i))); + U32 binTo = S64(toPosition/(this->l_contig_rounded_ / pow(4,i))); + /** + * If both ends of the interval map into the same chunk we know the interval is + * completely contained: in this case we deposit the interval there + **/ + if(binFrom == binTo){ + //if(i != this->n_levels_) std::cerr << fromPosition << "->" << toPosition << ", adding to " << binFrom << " level " << i << " cum : " << this->bins_cumsum_[i-1]+binFrom << "/" << this->size() << std::endl; + ++this->n_sites_; + this->bins_[this->bins_cumsum_[i - 1]+binFrom].add(yon_block_id); + return(this->bins_cumsum_[i - 1]+binFrom); + } + } + this->bins_[0].add(yon_block_id); + 
++this->n_sites_; + return(0); +} + +std::vector VariantIndexContig::possibleBins(const U64& from_position, const U64& to_position, const bool filter) const{ + std::vector overlapping_chunks; + //overlapping_chunks.push_back(this->at(0)); // level 0 + + // If end position are out-of-bounds then trucated it to maximum + // allowed value + U64 used_to_posititon = to_position; + if(used_to_posititon > this->l_contig_rounded_){ + //std::cerr << "out of bounds" << std::endl; + //std::cerr << to_position << "->" << this->l_contig_rounded_ << std::endl; + used_to_posititon = this->l_contig_rounded_; + } + + for(S32 i = this->n_levels_; i != 0; --i){ + S64 binFrom = S64(from_position/(this->l_contig_rounded_ / pow(4,i))); + S64 binTo = S64(used_to_posititon/(this->l_contig_rounded_ / pow(4,i))); + + //std::cerr << i << "/" << (int)this->n_levels_ << ": level offset " << this->bins_cumsum_[i-1] << "; (from, to) " << binFrom << " -> " << binTo << " out of " << this->size() << std::endl; + //std::cerr << "limit: " << this->bins_cumsum_[i] << std::endl; + + // Overlap from cumpos + (binFrom, binTo) + // All these chunks could potentially hold intervals overlapping + // the desired coordinates + for(U32 j = binFrom; j <= binTo; ++j){ + if(filter == false) + overlapping_chunks.push_back(this->at(this->bins_cumsum_[i - 1] + j)); + else { + if(this->at(this->bins_cumsum_[i - 1] + j).size()) + overlapping_chunks.push_back(this->at(this->bins_cumsum_[i - 1] + j)); + } + } + } + overlapping_chunks.push_back(this->at(0)); + + + return(overlapping_chunks); +} + +} +} diff --git a/lib/index/variant_index_contig.h b/lib/index/variant_index_contig.h new file mode 100644 index 0000000..6c68eb8 --- /dev/null +++ b/lib/index/variant_index_contig.h @@ -0,0 +1,219 @@ +#ifndef INDEX_VARIANT_INDEX_CONTIG_H_ +#define INDEX_VARIANT_INDEX_CONTIG_H_ + +#include +#include +#include + +#include "support/type_definitions.h" +#include "variant_index_bin.h" + +namespace tachyon{ +namespace index{ + 
+class VariantIndexContig{ +private: + typedef VariantIndexContig self_type; + typedef std::size_t size_type; + typedef VariantIndexBin value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + +public: + VariantIndexContig(); + VariantIndexContig(const U32 contigID, const U64 l_contig, const BYTE n_levels); + VariantIndexContig(const self_type& other); + void operator=(const self_type& other); + ~VariantIndexContig(); + + class iterator{ + private: + typedef iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + reference operator*() const{ return *ptr_; } + pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + class const_iterator{ + private: + typedef const_iterator self_type; + typedef std::forward_iterator_tag iterator_category; + + public: + const_iterator(pointer ptr) : ptr_(ptr) { } + void operator++() { ptr_++; } + void operator++(int junk) { ptr_++; } + const_reference operator*() const{ return *ptr_; } + const_pointer operator->() const{ return ptr_; } + bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } + bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } + private: + pointer ptr_; + }; + + // Element access + inline reference at(const size_type& position){ return(this->bins_[position]); } + inline const_reference at(const size_type& position) const{ return(this->bins_[position]); } + inline reference operator[](const size_type& position){ return(this->bins_[position]); } + inline const_reference operator[](const size_type& position) const{ return(this->bins_[position]); } + inline pointer 
data(void){ return(this->bins_); } + inline const_pointer data(void) const{ return(this->bins_); } + inline reference front(void){ return(this->bins_[0]); } + inline const_reference front(void) const{ return(this->bins_[0]); } + inline reference back(void){ return(this->bins_[this->n_bins_ - 1]); } + inline const_reference back(void) const{ return(this->bins_[this->n_bins_ - 1]); } + + // Capacity + inline bool empty(void) const{ return(this->n_bins_ == 0); } + inline const size_type& size(void) const{ return(this->n_bins_); } + inline const size_type& capacity(void) const{ return(this->n_capacity_); } + inline const size_type& size_sites(void) const{ return(this->n_sites_); } + + // Iterator + inline iterator begin(){ return iterator(&this->bins_[0]); } + inline iterator end(){ return iterator(&this->bins_[this->n_bins_]); } + inline const_iterator begin() const{ return const_iterator(&this->bins_[0]); } + inline const_iterator end() const{ return const_iterator(&this->bins_[this->n_bins_]); } + inline const_iterator cbegin() const{ return const_iterator(&this->bins_[0]); } + inline const_iterator cend() const{ return const_iterator(&this->bins_[this->n_bins_]); } + + // Accessor + inline U32& getContigID(void){ return(this->contigID_); } + inline const U32& getContigID(void) const{ return(this->contigID_); } + + /**< + * Add a target interval tuple (from,to,block_ID) + * @param fromPosition From position of interval + * @param toPosition To position of interval + * @param yon_block_id Tachyon block ID (generally a cumulative integer) + * @return + */ + S32 add(const U64& fromPosition, const U64& toPosition, const U32& yon_block_id); + + /**< + * Computes the possible bins an interval might overlap + * @param from_position From position of interval + * @param to_position To position of interval + * @return Returns a vector of viable overlapping bins + */ + std::vector possibleBins(const U64& from_position, const U64& to_position, const bool filter = true) const; + 
+private: + /**< + * Round target integer up to the closest number divisible by 4 + * @param length Input integer start value + * @return Return a target integer divisible by 4 + */ + inline U64 roundLengthClosestBase4_(const U64& length) const{ + return( ( pow(4,this->n_levels_) - (length % (U64)pow(4,this->n_levels_)) ) + length ); + } + + /**< + * Pre-calculate the cumulative distribution of 4^(0:levels-1). + * These values are used to find the array offset for levels > 0 + */ + void calculateCumulativeSums_(void){ + if(this->n_levels_ == 0) return; + + delete [] this->bins_cumsum_; + this->bins_cumsum_ = new U32[this->n_levels_ + 1]; // inclusive last + + U32 total = 0; + for(U32 i = 0; i <= this->n_levels_; ++i){ + total += pow(4,i); + this->bins_cumsum_[i] = total - 1; // remove 0 to start relative zero + } + } + + friend std::ostream& operator<<(std::ostream& stream, const self_type& contig){ + stream.write(reinterpret_cast(&contig.contigID_), sizeof(U32)); + stream.write(reinterpret_cast(&contig.l_contig_), sizeof(U64)); + stream.write(reinterpret_cast(&contig.l_contig_rounded_), sizeof(U64)); + stream.write(reinterpret_cast(&contig.n_bins_), sizeof(size_type)); + stream.write(reinterpret_cast(&contig.n_levels_), sizeof(BYTE)); + stream.write(reinterpret_cast(&contig.n_sites_), sizeof(size_type)); + + size_type n_items_written = 0; + for(U32 i = 0; i < contig.size(); ++i){ + if(contig.bins_[i].size()) ++n_items_written; + } + stream.write(reinterpret_cast(&n_items_written), sizeof(size_type)); + + for(U32 i = 0; i < contig.size(); ++i){ + // If bins[i] contains data + if(contig.bins_[i].size()) + stream << contig.bins_[i]; + } + return(stream); + } + + friend std::istream& operator>>(std::istream& stream, self_type& contig){ + // Clear old data + if(contig.size()){ + for(std::size_t i = 0; i < contig.size(); ++i) + (contig.bins_ + i)->~VariantIndexBin(); + + ::operator delete[](static_cast(contig.bins_)); + } + + delete [] contig.bins_cumsum_; + 
contig.bins_cumsum_ = nullptr; + + stream.read(reinterpret_cast(&contig.contigID_), sizeof(U32)); + stream.read(reinterpret_cast(&contig.l_contig_), sizeof(U64)); + stream.read(reinterpret_cast(&contig.l_contig_rounded_), sizeof(U64)); + stream.read(reinterpret_cast(&contig.n_bins_), sizeof(size_type)); + stream.read(reinterpret_cast(&contig.n_levels_), sizeof(BYTE)); + stream.read(reinterpret_cast(&contig.n_sites_), sizeof(size_type)); + contig.n_capacity_ = contig.n_bins_ + 64; + size_type n_items_written = 0; + stream.read(reinterpret_cast(&n_items_written), sizeof(size_type)); + + // Allocate new + contig.bins_ = static_cast(::operator new[](contig.capacity()*sizeof(value_type))); + for(U32 i = 0; i < contig.size(); ++i){ + new( &contig.bins_[i] ) value_type( ); + contig.bins_[i].binID_ = i; + } + contig.calculateCumulativeSums_(); + + // Load data accordingly + for(U32 i = 0; i < n_items_written; ++i){ + value_type temp; + stream >> temp; + //std::cerr << "loading: " << temp.size() << " entries" << std::endl; + contig.bins_[temp.binID_] = temp; + } + //std::cerr << std::endl; + return(stream); + } + +private: + U32 contigID_; + U64 l_contig_; // as described in header + U64 l_contig_rounded_; // rounded up to next base-4 + size_type n_bins_; + size_type n_capacity_; + BYTE n_levels_; // 7 by default + size_type n_sites_; + U32* bins_cumsum_; // 1, 1+4, 1+4+16, 1+4+16+64, ... 
+ pointer bins_; // bin information +}; + +} +} + + + +#endif /* INDEX_VARIANT_INDEX_CONTIG_H_ */ diff --git a/lib/index/variant_index_linear.h b/lib/index/variant_index_linear.h index 34faa02..25a86b9 100644 --- a/lib/index/variant_index_linear.h +++ b/lib/index/variant_index_linear.h @@ -91,7 +91,7 @@ class VariantIndexLinear{ inline const_reference back(void) const{ return(this->__entries[this->n_entries - 1]); } // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } + inline bool empty(void) const{ return(this->n_entries == 0); } inline const size_type& size(void) const{ return(this->n_entries); } inline const size_type& capacity(void) const{ return(this->n_capacity); } @@ -103,14 +103,15 @@ class VariantIndexLinear{ inline const_iterator cbegin() const{ return const_iterator(&this->__entries[0]); } inline const_iterator cend() const{ return const_iterator(&this->__entries[this->n_entries]); } - inline self_type& operator+=(const const_reference index_entry){ + inline self_type& operator+=(const_reference index_entry){ if(this->size() + 1 == this->n_capacity) this->resize(); this->__entries[this->n_entries++] = index_entry; return(*this); } - inline self_type& add(const const_reference index_entry){ return(*this += index_entry); } + + inline self_type& add(const_reference index_entry){ return(*this += index_entry); } void resize(void){ pointer temp = this->__entries; diff --git a/lib/io/BasicWriters.h b/lib/io/BasicWriters.h deleted file mode 100644 index 03d0db6..0000000 --- a/lib/io/BasicWriters.h +++ /dev/null @@ -1,168 +0,0 @@ -#ifndef TOMAHAWK_BASICWRITERS_H_ -#define TOMAHAWK_BASICWRITERS_H_ - -#include -#include -#include - -#include "algorithm/spinlock.h" -#include "basic_buffer.h" -#include "support/helpers.h" -#include "support/MagicConstants.h" - -namespace tachyon { -namespace io{ - -class GenericWriterInterace { -protected: - typedef io::BasicBuffer buffer_type; - typedef algorithm::SpinLock lock_type; - -public: - enum 
type {cout, file}; - enum compression {natural, binary}; - -public: - GenericWriterInterace(){} - virtual ~GenericWriterInterace(){} - - virtual bool open(void) =0; - virtual bool open(const std::string output) =0; - - // Always the same but contents in buffer may be different - virtual void operator<<(const buffer_type& buffer) =0; - virtual void operator<<(void* entry) =0; - - virtual const size_t write(const char* data, const U64& length) =0; - virtual std::ostream& getStream(void) =0; - virtual void flush(void) =0; - virtual bool close(void) =0; - inline lock_type* getLock(void){ return(&this->lock); } - - virtual inline const size_t writeNoLock(const char* data, const U32 length) =0; - virtual inline const size_t writeNoLock(const buffer_type& buffer) =0; - -protected: - lock_type lock; -}; - -class WriterStandardOut : public GenericWriterInterace{ -public: - WriterStandardOut(){} - ~WriterStandardOut(){ - // Flush upon termination - this->flush(); - } - - bool open(void){ return true; } - bool open(const std::string output){ - std::cerr << utility::timestamp("ERROR", "WRITER") << "Cannot set filename when destination is standard out..." 
<< std::endl; - return false; - } - void flush(void){ std::cout.flush(); } - inline bool close(void){ return true; } - inline std::ostream& getStream(void){ return(std::cout); } - - const size_t write(const char* data, const U64& length){ - this->lock.lock(); - std::cout.write(&data[0], length); - this->lock.unlock(); - return(length); - } - - inline const size_t writeNoLock(const char* data, const U32 length){ - std::cout.write(&data[0], length); - return(length); - } - - inline const size_t writeNoLock(const buffer_type& buffer){ - std::cout.write(buffer.data(), buffer.size()); - return(buffer.pointer); - } - - void operator<<(void* entry){} - - void operator<<(const buffer_type& buffer){ - // Mutex lock; write; unlock - // Note that this threads enter here at random - // Extremely unlikely there is every any contention - this->lock.lock(); - std::cout.write(buffer.data(), buffer.size()); - this->lock.unlock(); - } -}; - -// case file -class WriterFile : public GenericWriterInterace{ -public: - WriterFile(){} - ~WriterFile(){ - // Flush upon termination - this->flush(); - this->close(); - } - - bool open(void){ - std::cerr << utility::timestamp("ERROR", "WRITER") << "No output name provided..." << std::endl; - return false; - } - - bool open(const std::string output){ - if(output.length() == 0){ - std::cerr << utility::timestamp("ERROR", "WRITER") << "No output name provided..." << std::endl; - return false; - } - - this->stream.open(output, std::ios::binary | std::ios::out); - if(!this->stream.good()){ - std::cerr << utility::timestamp("ERROR", "WRITER") << "Could not open output file: " << output << "..." << std::endl; - return false; - } - - if(!SILENT) - std::cerr << utility::timestamp("LOG", "WRITER") << "Opening output file: " << output << "..." 
<< std::endl; - - return true; - } - - inline std::ostream& getStream(void){ return(this->stream); } - inline std::ofstream& getNativeStream(void){ return(this->stream); } - inline void flush(void){ this->stream.flush(); } - inline bool close(void){ this->stream.close(); return true; } - - void operator<<(const buffer_type& buffer){ - // Mutex lock; write; unlock - // Note that this threads enter here at random - // Extremely unlikely there is every any contention - this->lock.lock(); - this->stream.write(buffer.data(), buffer.size()); - this->lock.unlock(); - } - - void operator<<(void* entry){} - const size_t write(const char* data, const U64& length){ - this->lock.lock(); - this->stream.write(&data[0], length); - this->lock.unlock(); - return(length); - } - - inline const size_t writeNoLock(const char* data, const U32 length){ - this->stream.write(&data[0], length); - return(length); - } - - inline const size_t writeNoLock(const buffer_type& buffer){ - this->stream.write(buffer.data(), buffer.size()); - return(buffer.pointer); - } - -private: - std::string outFile; - std::ofstream stream; -}; - -} /* namespace IO */ -} /* namespace Tomahawk */ - -#endif /* TOMAHAWK_BASICWRITERS_H_ */ diff --git a/lib/io/basic_buffer.cpp b/lib/io/basic_buffer.cpp new file mode 100644 index 0000000..daef3c8 --- /dev/null +++ b/lib/io/basic_buffer.cpp @@ -0,0 +1,20 @@ +#include "basic_buffer.h" + +namespace tachyon{ +namespace io{ + +void SerializeString(const std::string& string, io::BasicBuffer& buffer){ + uint32_t size_helper = string.size(); + buffer += size_helper; + buffer += string; +} + +void DeserializeString(std::string& string, io::BasicBuffer& buffer){ + uint32_t size_helper; + buffer >> size_helper; + string.resize(size_helper); + buffer.read(&string[0], size_helper); +} + +} +} diff --git a/lib/io/basic_buffer.h b/lib/io/basic_buffer.h index 9715646..9412aa1 100644 --- a/lib/io/basic_buffer.h +++ b/lib/io/basic_buffer.h @@ -3,6 +3,9 @@ #include #include + +#include + 
#include "support/type_definitions.h" #include "support/helpers.h" @@ -106,24 +109,13 @@ struct BasicBuffer{ inline const U64& capacity(void) const{ return this->width; } void resize(const U64 new_size){ - if(new_size <= this->capacity()){ - if(new_size < this->size()) - this->n_chars = new_size; - - return; - } - - U64 copy_to = this->size(); - if(new_size < this->size()){ - copy_to = new_size; - this->n_chars = copy_to; - } - - //std::cerr << utility::timestamp("DEBUG") << "Resizing buffer: " << this->capacity() << " -> " << new_size << "\tcopyto: " << copy_to << std::endl; - char* target = this->buffer; - this->buffer = new char[new_size]; - memcpy(&this->buffer[0], &target[0], copy_to); - delete [] target; + if(this->n_chars == 0 && new_size == 0) return; + char* temp = new char[new_size]; + //std::cerr << this->size() << "<" << new_size << std::endl; + assert(this->size() < new_size); + memcpy(temp, this->buffer, this->size()); + delete [] this->buffer; + this->buffer = temp; this->width = new_size; } @@ -143,70 +135,70 @@ struct BasicBuffer{ void AddReadble(const SBYTE& value){ if(this->n_chars + 100 >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 100, this->width*2)); const int ret = sprintf(&this->buffer[this->n_chars], "%d", value); this->n_chars += ret; } void AddReadble(const S16& value){ if(this->n_chars + 100 >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 100, this->width*2)); const int ret = sprintf(&this->buffer[this->n_chars], "%d", value); this->n_chars += ret; } void AddReadble(const S32& value){ if(this->n_chars + 100 >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 100, this->width*2)); const int ret = sprintf(&this->buffer[this->n_chars], "%d", value); this->n_chars += ret; } void AddReadble(const BYTE& value){ if(this->n_chars + 100 >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 100, 
this->width*2)); const int ret = sprintf(&this->buffer[this->n_chars], "%u", value); this->n_chars += ret; } void AddReadble(const U16& value){ if(this->n_chars + 100 >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 100, this->width*2)); const int ret = sprintf(&this->buffer[this->n_chars], "%u", value); this->n_chars += ret; } void AddReadble(const U32& value){ if(this->n_chars + 100 >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 100, this->width*2)); const int ret = sprintf(&this->buffer[this->n_chars], "%u", value); this->n_chars += ret; } void AddReadble(const U64& value){ if(this->n_chars + 100 >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 100, this->width*2)); const int ret = sprintf(&this->buffer[this->n_chars], "%llu", value); this->n_chars += ret; } void AddReadble(const float& value){ if(this->n_chars + 100 >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 100, this->width*2)); const int ret = sprintf(&this->buffer[this->n_chars], "%g", value); this->n_chars += ret; } void AddReadble(const double& value){ if(this->n_chars + 100 >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 100, this->width*2)); const int ret = sprintf(&this->buffer[this->n_chars], "%g", value); this->n_chars += ret; } void AddReadble(const std::string& value){ if(this->n_chars + value.size() >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->n_chars + value.size() + 100, this->width*2)); *this += value; } @@ -222,7 +214,7 @@ struct BasicBuffer{ inline self_type& operator+=(const char& value){ if(this->n_chars + sizeof(char) >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 1000, this->width*2)); this->buffer[this->n_chars] = value; ++this->n_chars; @@ -231,7 +223,7 @@ struct BasicBuffer{ inline self_type& operator+=(const BYTE& value){ 
if(this->n_chars + sizeof(BYTE) >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 1000, this->width*2)); BYTE* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; @@ -241,7 +233,7 @@ struct BasicBuffer{ inline self_type& operator+=(const float& value){ if(this->n_chars + sizeof(float) >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 1000, this->width*2)); float* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; @@ -251,7 +243,7 @@ struct BasicBuffer{ inline self_type& operator+=(const U16& value){ if(this->n_chars + sizeof(U16) >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 1000, this->width*2)); U16* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; @@ -261,7 +253,7 @@ struct BasicBuffer{ inline self_type& operator+=(const short& value){ if(this->n_chars + sizeof(short) >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 1000, this->width*2)); short* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; @@ -271,7 +263,7 @@ struct BasicBuffer{ inline self_type& operator+=(const U32& value){ if(this->n_chars + sizeof(U32) >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 1000, this->width*2)); U32* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; @@ -281,7 +273,7 @@ struct BasicBuffer{ inline self_type& operator+=(const S32& value){ if(this->n_chars + sizeof(S32) >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 1000, this->width*2)); S32* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; @@ -291,7 +283,7 @@ struct BasicBuffer{ inline self_type& operator+=(const double& value){ if(this->n_chars + sizeof(double) >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 1000, this->width*2)); double* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = 
value; @@ -301,7 +293,7 @@ struct BasicBuffer{ inline self_type& operator+=(const U64& value){ if(this->n_chars + sizeof(U64) >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 1000, this->width*2)); U64* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; @@ -309,9 +301,19 @@ struct BasicBuffer{ return *this; } + inline self_type& operator+=(const int64_t& value){ + if(this->n_chars + sizeof(int64_t) >= this->width) + this->resize(std::max(this->width + 1000, this->width*2)); + + int64_t* p = reinterpret_cast(&this->buffer[this->n_chars]); + *p = value; + this->n_chars += sizeof(int64_t); + return *this; + } + inline self_type& operator+=(const size_t& value){ if(this->n_chars + sizeof(size_t) >= this->width) - this->resize(this->width*2); + this->resize(std::max(this->width + 1000, this->width*2)); size_t* p = reinterpret_cast(&this->buffer[this->n_chars]); *p = value; @@ -321,10 +323,7 @@ struct BasicBuffer{ inline self_type& operator+=(const std::string& value){ if(this->n_chars + value.size() + sizeof(BYTE) >= this->width){ - U64 resize_to = this->width * 2; - while(this->n_chars + value.size() + sizeof(BYTE) >= resize_to) - resize_to *= 2; - + U64 resize_to = std::max(this->n_chars + value.size() + sizeof(BYTE) + 1000, this->width * 2); this->resize(resize_to); } @@ -395,6 +394,12 @@ struct BasicBuffer{ return(data); } + friend self_type& operator>>(self_type& data, int64_t& target){ + target = *reinterpret_cast(&data.buffer[data.iterator_position_]); + data.iterator_position_ += sizeof(int64_t); + return(data); + } + friend self_type& operator>>(self_type& data, float& target){ target = *reinterpret_cast(&data.buffer[data.iterator_position_]); data.iterator_position_ += sizeof(float); @@ -426,6 +431,19 @@ struct BasicBuffer{ pointer buffer; }; +void SerializeString(const std::string& string, io::BasicBuffer& buffer); +void DeserializeString(std::string& string, io::BasicBuffer& buffer); + +template +static void 
SerializePrimitive(const T& value, io::BasicBuffer& buffer){ + buffer += value; +} + +template +static void DeserializePrimitive(T& value, io::BasicBuffer& buffer){ + buffer >> value; +} + } /* namespace IO */ } /* namespace Tomahawk */ diff --git a/lib/io/BasicReader.cpp b/lib/io/basic_reader.cpp similarity index 76% rename from lib/io/BasicReader.cpp rename to lib/io/basic_reader.cpp index 1d70a5d..99df594 100644 --- a/lib/io/BasicReader.cpp +++ b/lib/io/basic_reader.cpp @@ -1,12 +1,45 @@ -#include "BasicReader.h" +#include "basic_reader.h" #include "support/helpers.h" namespace tachyon{ namespace io{ -BasicReader::BasicReader() : filesize_(0), block_size_(65536), capacity_(this->block_size_*2), end_(0), buffer_(new type[this->capacity_]){} -BasicReader::BasicReader(std::string input) : filename_(input), filesize_(0), block_size_(65536), capacity_(this->block_size_*2), end_(0), buffer_(new type[this->capacity_]){} -BasicReader::BasicReader(std::string input, const size_t block_size) : filename_(input), filesize_(0), block_size_(block_size), capacity_(this->block_size_*2), end_(0), buffer_(new type[this->capacity_]){} +BasicReader::BasicReader() : + filesize_(0), + block_size_(65536), + capacity_(this->block_size_*2), + end_(0), + buffer_(new type[this->capacity_]) +{} + +BasicReader::BasicReader(std::string input) : + filename_(input), + filesize_(0), + block_size_(65536), + capacity_(this->block_size_*2), + end_(0), + buffer_(new type[this->capacity_]) +{} + +BasicReader::BasicReader(std::string input, const size_t block_size) : + filename_(input), + filesize_(0), + block_size_(block_size), + capacity_(this->block_size_*2), + end_(0), + buffer_(new type[this->capacity_]) +{} + +BasicReader::BasicReader(const self_type& other) : + filename_(other.filename_), + filesize_(other.filesize_), + block_size_(other.block_size_), + capacity_(other.capacity_), + end_(other.end_), + buffer_(new type[this->capacity_]) +{ + memcpy(this->buffer_, other.buffer_, other.end_); +} 
bool BasicReader::open(std::string filename){ // If filename is empty @@ -47,8 +80,8 @@ bool BasicReader::open(void){ // Reset buffer pointer to 0 this->end_ = 0; - if(!SILENT) - std::cerr << utility::timestamp("LOG", "IO") << "Opened file: " << this->filename_ << " (" << this->filesize_ << " b)..." << std::endl; + //if(!SILENT) + // std::cerr << utility::timestamp("LOG", "IO") << "Opened file: " << this->filename_ << " (" << this->filesize_ << " b)..." << std::endl; return(true); } diff --git a/lib/io/BasicReader.h b/lib/io/basic_reader.h similarity index 91% rename from lib/io/BasicReader.h rename to lib/io/basic_reader.h index 3522326..6e2ae85 100644 --- a/lib/io/BasicReader.h +++ b/lib/io/basic_reader.h @@ -26,12 +26,15 @@ class BasicReader { typedef const type &const_reference; typedef size_t size_type; typedef ptrdiff_t difference_type; + typedef BasicReader self_type; public: BasicReader(); BasicReader(std::string input); + BasicReader(const self_type& other); BasicReader(std::string input, const size_t block_size); virtual ~BasicReader(){ delete[] this->buffer_; } + virtual const_reference operator[](const size_t p) const{ return this->buffer_[p]; } virtual reference operator[](const size_t p){ return this->buffer_[p]; } @@ -97,7 +100,7 @@ class BasicReader { delete [] old; } - virtual const size_t size(void) const { return this->end_; } // Virtual -> allowed to overwrite in children classes + virtual size_t size(void) const { return this->end_; } // Virtual -> allowed to overwrite in children classes bool good(void) const{ return this->stream_.good(); } virtual bool open(void); @@ -106,14 +109,14 @@ class BasicReader { virtual bool read(void); virtual bool read(const uint32_t length); virtual bool readAppend(const uint32_t length); - inline const uint64_t& filesize(void){ return this->filesize_; } - inline uint64_t tellg(void){ return this->stream_.tellg(); } + inline const size_t& filesize(void){ return this->filesize_; } + inline size_t tellg(void){ 
return this->stream_.tellg(); } bool getLine(void); // Read until finding a new line into buffer bool getLine(std::string& data); // Read until finding a new line into string public: std::string filename_; // Input file name - uint64_t filesize_; // Input file size + size_t filesize_; // Input file size size_t block_size_; // Size of block read each iteration size_t capacity_; // Capacity of buffer size_t end_; // End pointer of data in buffer diff --git a/lib/io/bcf/BCFEntry.cpp b/lib/io/bcf/BCFEntry.cpp deleted file mode 100644 index 498d3c4..0000000 --- a/lib/io/bcf/BCFEntry.cpp +++ /dev/null @@ -1,543 +0,0 @@ -#include "BCFEntry.h" - -#include -#include - - -namespace tachyon { -namespace bcf { - -BCFEntry::BCFEntry(void): - l_data(0), - l_capacity(128144), - l_ID(0), - ref_alt(0), - isGood(false), - data(new char[this->l_capacity]), - body(reinterpret_cast(this->data)), - alleles(new string_type[100]), - ID(nullptr), - hasGenotypes(false), - ploidy(0), - filter_start(0), - n_filter(0), - filterPointer(0), - infoPointer(0), - formatPointer(0), - filterID(new BCFKeyTuple[256]), - infoID(new BCFKeyTuple[256]), - formatID(new BCFKeyTuple[256]) -{ - -} - -BCFEntry::BCFEntry(const U64 start_capacity): - l_data(0), - l_capacity(256000 + start_capacity), - l_ID(0), - ref_alt(0), - isGood(false), - data(new char[this->l_capacity]), - body(reinterpret_cast(this->data)), - alleles(new string_type[100]), - ID(nullptr), - hasGenotypes(false), - ploidy(0), - filter_start(0), - n_filter(0), - filterPointer(0), - infoPointer(0), - formatPointer(0), - filterID(new BCFKeyTuple[256]), - infoID(new BCFKeyTuple[256]), - formatID(new BCFKeyTuple[256]) -{ - -} - - -BCFEntry::BCFEntry(const self_type& other): - l_data(other.l_data), - l_capacity(other.l_capacity), - l_ID(other.l_ID), - ref_alt(other.ref_alt), - isGood(other.isGood), - data(new char[other.l_capacity]), - body(nullptr), - alleles(new string_type[100]), - ID(nullptr), - hasGenotypes(other.hasGenotypes), - 
ploidy(other.ploidy), - filter_start(other.filter_start), - n_filter(other.n_filter), - filterPointer(other.filterPointer), - infoPointer(other.infoPointer), - formatPointer(other.formatPointer), - gt_support(other.gt_support), - filterID(new BCFKeyTuple[256]), - infoID(new BCFKeyTuple[256]), - formatID(new BCFKeyTuple[256]) -{ - memcpy(this->data, other.data, other.l_data); - this->body = reinterpret_cast(this->data); - - for(U32 i = 0; i < 256; ++i){ - this->filterID[i] = other.filterID[i]; - this->formatID[i] = other.formatID[i]; - this->infoID[i] = other.infoID[i]; - } - - U32 internal_pos = sizeof(body_type); - this->__parseID(internal_pos); - //this->__parseRefAlt(internal_pos); - //this->SetRefAlt(); - for(U32 i = 0; i < this->body->n_allele; ++i) - this->alleles[i] = other.alleles[i]; -} - -BCFEntry::BCFEntry(self_type&& other) noexcept : - l_data(other.l_data), - l_capacity(other.l_capacity), - l_ID(other.l_ID), - ref_alt(other.ref_alt), - isGood(other.isGood), - data(other.data), - body(other.body), - alleles(other.alleles), - ID(other.ID), - hasGenotypes(other.hasGenotypes), - ploidy(other.ploidy), - filter_start(other.filter_start), - n_filter(other.n_filter), - filterPointer(other.filterPointer), - infoPointer(other.infoPointer), - formatPointer(other.formatPointer), - gt_support(other.gt_support), - filterID(other.filterID), - infoID(other.infoID), - formatID(other.formatID) -{ - other.data = nullptr; - other.body = nullptr; - other.alleles = nullptr; - other.ID = nullptr; - other.filterID = nullptr; - other.infoID = nullptr; - other.formatID = nullptr; -} - -BCFEntry& BCFEntry::operator=(const self_type& other){ - self_type tmp(other); // re-use copy-constructor - *this = std::move(tmp); // re-use move-assignment - return *this; -} - -BCFEntry& BCFEntry::operator=(self_type&& other) noexcept{ - if (this == &other) - { - // take precautions against `foo = std::move(foo)` - return *this; - } - - delete [] this->data; - delete [] this->alleles; - delete 
[] this->filterID; - delete [] this->infoID; - delete [] this->formatID; - this->l_data = other.l_data; - this->l_capacity = other.l_capacity; - this->l_ID = other.l_ID; - this->ref_alt = other.ref_alt; - this->isGood = other.isGood; - this->data = other.data; - this->body = other.body; - this->alleles = other.alleles; - this->ID = other.ID; - this->hasGenotypes = other.hasGenotypes; - this->ploidy = other.ploidy; - this->filter_start = other.filter_start; - this->n_filter = other.n_filter; - this->filterPointer = other.filterPointer; - this->infoPointer = other.infoPointer; - this->formatPointer = other.formatPointer; - this->filterID = other.filterID; - this->infoID = other.infoID; - this->formatID = other.formatID; - this->gt_support = other.gt_support; - - other.data = nullptr; - other.alleles = nullptr; - other.filterID = nullptr; - other.infoID = nullptr; - other.formatID = nullptr; - - return *this; -} - -BCFEntry::~BCFEntry(void){ - delete [] this->data; - delete [] this->alleles; - delete [] this->filterID; - delete [] this->infoID; - delete [] this->formatID; -} - -void BCFEntry::resize(const U32 size){ - if(size == 0) - return; - - char* temp = this->data; - this->data = new char[size]; - memcpy(this->data, temp, this->l_data); - delete [] temp; - this->body = reinterpret_cast(this->data); - - if(size > this->l_capacity) - this->l_capacity = size; -} - -void BCFEntry::add(const char* const data, const U32 length){ - if(this->l_data + length > this-> capacity()) - this->resize(this->l_data + length + 65536); - - memcpy(&this->data[this->l_data], data, length); - this->l_data += length; -} - -void BCFEntry::__parseID(U32& internal_pos){ - // Parse ID - const base_type& ID_base = *reinterpret_cast(&this->data[internal_pos++]); -#if BCF_ASSERT == 1 - assert(ID_base.low == 7); -#endif - - this->ID = &this->data[internal_pos]; - this->l_ID = ID_base.high; - if(ID_base.high == 0){ // has no name - this->l_ID = 0; - } else if(ID_base.high == 15){ - // next byte 
is the length array - // Type and length - const base_type& array_base = *reinterpret_cast(&this->data[internal_pos++]); - this->l_ID = this->getInteger(array_base.low, internal_pos); - this->ID = &this->data[internal_pos]; - } - internal_pos += this->l_ID; -} - -void BCFEntry::__parseRefAlt(U32& internal_pos){ - // Parse REF-ALT - for(U32 i = 0; i < this->body->n_allele; ++i){ - const base_type& alelle_base = *reinterpret_cast(&this->data[internal_pos++]); -#if BCF_ASSERT == 1 - assert(alelle_base.low == 7); -#endif - - S32 length = alelle_base.high; - const char* ref_alt_data = &this->data[internal_pos]; - - if(alelle_base.high == 15){ - // Type and length - const base_type& array_base = *reinterpret_cast(&this->data[internal_pos++]); - length = this->getInteger(array_base.low, internal_pos); - ref_alt_data = &this->data[internal_pos]; - } - this->alleles[i](ref_alt_data, length); - - //std::cerr << std::string(this->alleles[i].data, this->alleles[i].length) << std::endl; - internal_pos += this->alleles[i].length; - } -} - -bool BCFEntry::nextFilter(S32& value, U32& position){ - if(this->filterPointer == this->n_filter) - return false; - - value = this->getInteger(this->filter_key.low, position); - this->filterID[this->filterPointer++].mapID = value; - - return true; -} - -bool BCFEntry::nextInfo(S32& value, U32& length, BYTE& value_type, U32& position){ - if(this->infoPointer == this->body->n_info) - return false; - - const base_type& info_key = *reinterpret_cast(&this->data[position++]); - #if BCF_ASSERT == 1 - // The first object returns a single identifier - // to a field. 
It should always be a single - // value - assert(info_key.high == 1); - #endif - - // INFO identifier - value = this->getInteger(info_key.low, position); - this->infoID[this->infoPointer++].mapID = value; - - // Data for this identifier - const base_type& info_value = *reinterpret_cast(&this->data[position++]); - length = info_value.high; - if(length == 15){ - const base_type& array_base = *reinterpret_cast(&this->data[position++]); - length = this->getInteger(array_base.low, position); - } - value_type = info_value.low; - - return true; -} - -bool BCFEntry::nextFormat(S32& value, U32& length, BYTE& value_type, U32& position){ - if(this->formatPointer == this->body->n_fmt) - return false; - - const base_type& format_key = *reinterpret_cast(&this->data[position++]); - #if BCF_ASSERT == 1 - // This first bit returns a single identifier - // to a field. It should always be a single - // value - assert(format_key.high == 1); - #endif - - // format identifier - value = this->getInteger(format_key.low, position); - this->formatID[this->formatPointer++].mapID = value; - - // Data for this identifier - const base_type& format_value = *reinterpret_cast(&this->data[position++]); - length = format_value.high; - if(length == 15){ - const base_type& array_base = *reinterpret_cast(&this->data[position++]); - length = this->getInteger(array_base.low, position); - } - value_type = format_value.low; - - return true; -} - -bool BCFEntry::parse(const U64 n_samples){ - this->body = reinterpret_cast(this->data); - U32 internal_pos = sizeof(body_type); - this->__parseID(internal_pos); - this->__parseRefAlt(internal_pos); - this->SetRefAlt(); - - // start of FILTER - this->filter_start = internal_pos; - - // At FILTER - // Typed vector - const base_type& filter_key = *reinterpret_cast(&this->data[internal_pos++]); - U32 n_filter = filter_key.high; - if(n_filter == 15) n_filter = this->getInteger(filter_key.low, internal_pos); - this->n_filter = n_filter; - this->filter_key = filter_key; 
- - S32 val = 0; - while(this->nextFilter(val, internal_pos)){} - - // At INFO - U32 info_length; - BYTE info_primitive_type; - for(U32 i = 0; i < this->body->n_info; ++i){ - if(this->nextInfo(val, info_length, info_primitive_type, internal_pos) == false){ - std::cerr << "illegal match info" << std::endl; - exit(1); - } - this->infoID[i].l_stride = info_length; - this->infoID[i].primitive_type = info_primitive_type; - this->infoID[i].l_offset = internal_pos; - - // Flags and integers - // These are BCF value types - if(info_primitive_type <= 3){ - for(U32 j = 0; j < info_length; ++j){ - this->getInteger(info_primitive_type, internal_pos); - } - } - // Floats - else if(info_primitive_type == 5){ - for(U32 j = 0; j < info_length; ++j){ - this->getFloat(internal_pos); - } - } - // Chars - else if(info_primitive_type == 7){ - for(U32 j = 0; j < info_length; ++j){ - this->getChar(internal_pos); - } - } - // Illegal: parsing error - else { - std::cerr << "impossible in info: " << (int)info_primitive_type << std::endl; - exit(1); - } - } - - assert(internal_pos == (this->body->l_shared + sizeof(U32)*2)); - - if(internal_pos == this->l_data){ - //std::cerr << "have no FORMAT data" << std::endl; - return true; - } - - BYTE format_primitive_type = 0; - for(U32 i = 0; i < this->body->n_fmt; ++i){ - if(this->nextFormat(val, info_length, format_primitive_type, internal_pos) == false){ - std::cerr << "illegal match format" << std::endl; - exit(1); - } - - this->formatID[i].l_stride = info_length; - this->formatID[i].primitive_type = format_primitive_type; - this->formatID[i].l_offset = internal_pos; - - // Flags and integers - // These are BCF value types - if(format_primitive_type <= 3){ - for(U32 s = 0; s < n_samples; ++s){ - for(U32 j = 0; j < info_length; ++j) - this->getInteger(format_primitive_type, internal_pos); - } - } - // Floats - else if(format_primitive_type == 5){ - for(U32 s = 0; s < n_samples; ++s){ - for(U32 j = 0; j < info_length; ++j) - 
this->getFloat(internal_pos); - - } - } - // Chars - else if(format_primitive_type == 7){ - for(U32 s = 0; s < n_samples; ++s){ - for(U32 j = 0; j < info_length; ++j) - this->getChar(internal_pos); - } - } - // Illegal: parsing error - else { - std::cerr << "impossible in format: " << (int)format_primitive_type << std::endl; - std::cerr << utility::timestamp("LOG") << val << '\t' << info_length << '\t' << (int)format_primitive_type << '\t' << internal_pos << '/' << this->l_data << std::endl; - exit(1); - } - } - - assert(internal_pos == this->l_data); - - this->isGood = true; - return true; -} - -void BCFEntry::SetRefAlt(void){ - this->ref_alt = 0; - // - if(this->alleles[0].length == 9 && strncmp(this->alleles[0].data, "", 9) == 0){ - this->ref_alt ^= constants::REF_ALT_NON_REF << 4; - } - - if(this->alleles[1].length == 9 && strncmp(this->alleles[1].data, "", 9) == 0){ - this->ref_alt ^= constants::REF_ALT_NON_REF << 0; - } - - // Set mock ref-alt if not simple - if(this->alleles[0].length != 1 || this->alleles[1].length != 1){ - return; - } - - switch(this->alleles[0].data[0]){ - case 'A': this->ref_alt ^= constants::REF_ALT_A << 4; break; - case 'T': this->ref_alt ^= constants::REF_ALT_T << 4; break; - case 'G': this->ref_alt ^= constants::REF_ALT_G << 4; break; - case 'C': this->ref_alt ^= constants::REF_ALT_C << 4; break; - case 'N': this->ref_alt ^= constants::REF_ALT_N << 4; break; - default: - std::cerr << utility::timestamp("ERROR", "BCF") << "Illegal SNV reference..." 
<< std::endl; - std::cerr << this->alleles[1].data << std::endl; - std::cerr << this->alleles[0].data << std::endl; - exit(1); - } - - switch(this->alleles[1].data[0]){ - case 'A': this->ref_alt ^= constants::REF_ALT_A << 0; break; - case 'T': this->ref_alt ^= constants::REF_ALT_T << 0; break; - case 'G': this->ref_alt ^= constants::REF_ALT_G << 0; break; - case 'C': this->ref_alt ^= constants::REF_ALT_C << 0; break; - case '.': this->ref_alt ^= constants::REF_ALT_MISSING << 0; break; - case 'N': this->ref_alt ^= constants::REF_ALT_N << 0; break; - default: - std::cerr << utility::timestamp("ERROR", "BCF") << "Illegal SNV alt..." << std::endl; - exit(1); - } -} - -bool BCFEntry::assessGenotypes(const U64 n_samples){ - if(this->hasGenotypes == false) - return false; - - const BYTE ploidy = this->formatID[0].l_stride; - // Todo: other primitives - U64 current_sample = 0; - const char* const internal_data = &this->data[this->formatID[0].l_offset]; - U32 internal_data_offset = 0; - this->gt_support.ploidy = ploidy; - this->gt_support.hasGenotypes = true; - - BYTE first_phase = 0; - - // TODO other BCF primitives - if(this->formatID[0].primitive_type == YON_BCF_PRIMITIVE_TYPES::BCF_BYTE){ - // First - bool invariant = true; - for(U32 p = 0; p < ploidy; ++p){ - const BYTE& ref = *reinterpret_cast(&internal_data[internal_data_offset]); - if((ref >> 1) == 0){ - this->gt_support.hasMissing = true; - ++this->gt_support.n_missing; - } else if(ref == 0x81){ - //std::cerr << "hit first" << std::endl; - this->gt_support.hasEOV = true; - ++this->gt_support.n_eov; - } - if(p + 1 == ploidy) first_phase = ref & 1; - if(ref >> 1 != 1) invariant = false; - internal_data_offset += sizeof(BYTE); - } - ++current_sample; - - for(U32 i = ploidy; i < n_samples*ploidy; i+=ploidy, ++current_sample){ - // retrieve ploidy primitives - for(U32 p = 0; p < ploidy; ++p){ - const BYTE& ref = *reinterpret_cast(&internal_data[internal_data_offset]); - if((ref >> 1) == 0){ - //std::cerr << "is missing" 
<< std::endl; - this->gt_support.hasMissing = true; - ++this->gt_support.n_missing; - } else if(ref == 0x81){ - //std::cerr << "is vector eof" << std::endl; - this->gt_support.hasEOV = true; - ++this->gt_support.n_eov; - } - - if(p + 1 == ploidy){ - if(first_phase != (ref & 1)){ - //std::cerr << "triggering mixed phase" << std::endl; - this->gt_support.mixedPhasing = true; - this->gt_support.phase = 0; - } - } - internal_data_offset += sizeof(BYTE); - if(ref >> 1 != 1) invariant = false; - } - } - - if(this->gt_support.mixedPhasing == false){ - this->gt_support.phase = first_phase; - } - this->gt_support.invariant = invariant; - } - return true; -} - -} -} diff --git a/lib/io/bcf/BCFEntry.h b/lib/io/bcf/BCFEntry.h deleted file mode 100644 index 609b055..0000000 --- a/lib/io/bcf/BCFEntry.h +++ /dev/null @@ -1,359 +0,0 @@ -#ifndef BCF_BCFENTRY_H_ -#define BCF_BCFENTRY_H_ - -#include "io/vcf/VCFHeader.h" -#include "third_party/xxhash/xxhash.h" - -namespace tachyon { -namespace bcf { - -#define BCF_ENTRY_BASE_ALLOCATION 262144 -// Enforce assertions of correctness -#define BCF_ASSERT 1 -// Hash-specific seed -#define BCF_HASH_SEED 452930477 - -const BYTE BCF_UNPACK_TACHYON[3] = {2, 0, 1}; -#define BCF_UNPACK_GENOTYPE(A) BCF_UNPACK_TACHYON[((A) >> 1)] -const char BCF_TYPE_SIZE[8] = {0,1,2,4,0,4,0,1}; - -/**< - * BCF-specific primitive triggers - */ -enum YON_BCF_PRIMITIVE_TYPES{ - BCF_FLAG = 0, //!< BCF_FLAG - BCF_BYTE = 1, //!< BCF_BYTE - BCF_U16 = 2, //!< BCF_U16 - BCF_U32 = 3, //!< BCF_U32 - BCF_FLOAT = 5,//!< BCF_FLOAT - BCF_CHAR = 7 //!< BCF_CHAR -}; - -#pragma pack(push, 1) -struct __attribute__((packed, aligned(1))) BCFAtomicBase{ - BYTE low: 4, high: 4; -}; - -struct __attribute__((packed, aligned(1))) BCFEntryBody{ - typedef BCFEntryBody self_type; - - BCFEntryBody(); // disallow ctor - ~BCFEntryBody(); // disallow dtor - - // For debugging only - friend std::ostream& operator<<(std::ostream& os, const self_type& header){ - os << "l_shared\t" << 
(U32)header.l_shared << '\n'; - os << "l_indiv\t" << (U32)header.l_indiv << '\n'; - os << "CHROM\t" << (U32)header.CHROM << '\n'; - os << "POS\t" << (U32)header.POS << '\n'; - os << "rlen\t" << (S32)header.rlen << '\n'; - os << "QUAL\t" << (U32)header.QUAL << '\n'; - os << "n_allele\t" << (U32)header.n_allele << '\n'; - os << "n_info\t" << (U16)header.n_info << '\n'; - os << "n_fmt\t" << (U32)header.n_fmt << '\n'; - os << "n_sample\t" << (U32)header.n_sample; - - return os; - } - - U32 l_shared; - U32 l_indiv; - S32 CHROM; - S32 POS; - S32 rlen; - float QUAL; - U32 n_info: 16, n_allele: 16; - U32 n_sample: 24, n_fmt: 8; -}; -#pragma pack(pop) - -struct BCFTypeString{ - typedef BCFTypeString self_type; - - BCFTypeString() : length(0), data(nullptr){} - BCFTypeString(const self_type& other) : length(other.length), data(new char[other.length]){ memcpy(this->data, other.data, other.length); } - void operator=(const self_type& other){ - this->length = other.length; - delete [] this->data; - this->data = new char[other.length]; - memcpy(this->data, other.data, other.length); - } - void operator()(const char* const data, const U16 length){ - this->length = length; - delete [] this->data; - this->data = new char[length]; - memcpy(this->data, data, length); - } - ~BCFTypeString(){ - delete [] this->data; - } - - U16 length; - char* data; -}; - -struct BCFKeyTuple{ - BCFKeyTuple() : mapID(-1), primitive_type(0), l_stride(0), l_offset(0){} - ~BCFKeyTuple(){} - - S32 mapID; - BYTE primitive_type; - U32 l_stride; - U32 l_offset; -}; - -struct BCFGenotypeSupport{ -private: - typedef BCFGenotypeSupport self_type; - -public: - BCFGenotypeSupport() : - hasGenotypes(false), - hasMissing(false), - hasEOV(false), - mixedPhasing(false), - invariant(false), - ploidy(0), - phase(0), - n_missing(0), - n_eov(0) - {} - - BCFGenotypeSupport(const self_type& other) : - hasGenotypes(other.hasGenotypes), - hasMissing(other.hasMissing), - hasEOV(other.hasEOV), - mixedPhasing(other.mixedPhasing), 
- invariant(other.invariant), - ploidy(other.ploidy), - phase(other.phase), - n_missing(other.n_missing), - n_eov(other.n_eov) - { - } - - void operator=(const self_type& other){ - this->hasGenotypes = other.hasGenotypes; - this->hasMissing = other.hasMissing; - this->hasEOV = other.hasEOV; - this->mixedPhasing = other.mixedPhasing; - this->invariant = other.invariant; - this->ploidy = other.ploidy; - this->phase = other.phase; - this->n_missing = other.n_missing; - this->n_eov = other.n_eov; - } - -public: - bool hasGenotypes; - bool hasMissing; - bool hasEOV; - bool mixedPhasing; - bool invariant; - U32 ploidy; - BYTE phase; - U32 n_missing; - U32 n_eov; -}; - -struct BCFEntry{ -private: - typedef BCFEntry self_type; - typedef io::BasicBuffer buffer_type; - typedef BCFEntryBody body_type; - typedef BCFTypeString string_type; - typedef BCFAtomicBase base_type; - typedef BCFGenotypeSupport gt_support_type; - -public: - BCFEntry(void); // ctor - BCFEntry(const U64 start_capacity); // ctor - BCFEntry(const self_type& other); // copy ctor - BCFEntry(self_type&& other) noexcept; - BCFEntry& operator=(const self_type& other); - BCFEntry& operator=(self_type&& other) noexcept; - ~BCFEntry(void); // dtor - - void resize(const U32 size); - void add(const char* const data, const U32 length); - - inline void reset(void){ - this->l_data = 0; - this->isGood = false; - this->infoPointer = 0; - this->formatPointer = 0; - this->filterPointer = 0; - this->n_filter = 0; - this->filter_start = 0; - this->ploidy = 0; - this->hasGenotypes = false; - } - - inline const U32& size(void) const{ return(this->l_data); } - inline const U32& capacity(void) const{ return(this->l_capacity); } - inline const U64 sizeBody(void) const{ return(this->body->l_shared + this->body->l_indiv); } - - inline const bool isBiallelicSimple(void) const{ - return((this->body->n_allele == 2) && (this->alleles[0].length == 1 && this->alleles[1].length == 1)); - } - inline const bool isBiallelic(void) const{ 
return(this->body->n_allele == 2); } - inline const bool isSimple(void) const{ - return(this->alleles[0].length == 1 && this->alleles[1].length == 1); - } - - void __parseID(U32& internal_pos); - void __parseRefAlt(U32& internal_pos); - - bool parse(const U64 n_samples); - - void SetRefAlt(void); - - //s - bool assessGenotypes(const U64 n_samples); - - inline const bool& good(void) const{ return(this->isGood); } - - /**< - * Decode an integer primitive from a BCF buffer stream. Forces all - * return types to be of type S32 and with missing and EOV values - * expanded to match this possibly larger primitive type. - * @param key - * @param pos - * @return - */ - inline const S32 getInteger(const BYTE& key, U32& pos){ - S32 value = 0; - if(key == 1){ - const SBYTE& ref = *reinterpret_cast(&this->data[pos++]); - const BYTE& uref = *reinterpret_cast(&ref); - if(uref == 0x80){ - //std::cerr << "is missing" << std::endl; - return(value = 0x80000000); - } else if(uref == 0x81){ - return(value = 0x80000001); - //std::cerr << "is vector eof" << std::endl; - } - return(value = ref); - } else if(key == 2){ - const S16& ref = *reinterpret_cast(&this->data[pos]); - const U16& uref = *reinterpret_cast(&ref); - pos+=sizeof(S16); - - if(uref == 0x8000){ - //std::cerr << "is missing s16" << std::endl; - return(value = 0x80000000); - } else if(uref == 0x8001){ - //std::cerr << "is vector eof" << std::endl; - return(value = 0x80000001); - } - return(value = ref); - } else if(key == 3){ - const S32& ref = *reinterpret_cast(&this->data[pos]); - const U32& uref = *reinterpret_cast(&ref); - pos+=sizeof(S32); - - if(uref == 0x80000000){ - //std::cerr << "is missing" << std::endl; - return(value = 0x80000000); - } else if(uref == 0x80000001){ - //std::cerr << "is vector eof" << std::endl; - return(value = 0x80000001); - } - return(value = ref); - } else if(key == 0){ - return 0; - } else { - std::cerr << "illegal type" << std::endl; - exit(1); - } - } - - /**< - * Decodes a float values 
from a BCF buffer stream - * @param pos - * @return - */ - inline const float getFloat(U32& pos){ - const float& val = *reinterpret_cast(&this->data[pos]); - pos += sizeof(float); - return val; - } - - /**< - * Decodes a char from a BCF buffer stream - * @param pos - * @return - */ - inline const char getChar(U32& pos){ return(this->data[pos++]); } - inline const char* const getCharPointer(U32& pos){ return(&this->data[pos]); } - - // Hash field identifiers - inline const U64 hashFilter(void){ return(this->__hashTarget(this->filterID, this->filterPointer)); } - inline const U64 hashInfo(void){ return(this->__hashTarget(this->infoID, this->infoPointer)); } - inline const U64 hashFormat(void){ return(this->__hashTarget(this->formatID, this->formatPointer)); } - - // Iterators over fields - bool nextFilter(S32& value, U32& position); - bool nextInfo(S32& value, U32& length, BYTE& value_type, U32& position); - bool nextFormat(S32& value, U32& length, BYTE& value_type, U32& position); - -private: - /**< - * Calculates the 64-bit hash value for the target FORMAT/FILTER/INFO fields - * @param tuples Input target of BCFTuples - * @param n_entries Number of BCFTuples in input - * @return Returns a 64-bit hash value - */ - const U64 __hashTarget(const BCFKeyTuple* tuples, const U16& n_entries){ - XXH64_state_t* const state = XXH64_createState(); - if (state==NULL) abort(); - - XXH_errorcode const resetResult = XXH64_reset(state, BCF_HASH_SEED); - if (resetResult == XXH_ERROR) abort(); - - for(U32 i = 0; i < n_entries; ++i){ - XXH_errorcode const addResult = XXH64_update(state, (const void*)&tuples[i].mapID, sizeof(S32)); - if (addResult == XXH_ERROR) abort(); - } - - U64 hash = XXH64_digest(state); - XXH64_freeState(state); - - return hash; - } - -public: - U32 l_data; // byte width - U32 l_capacity; // capacity - U32 l_ID; - BYTE ref_alt; // parsed - bool isGood; - char* data; // hard copy data to buffer, interpret internally - body_type* body; // BCF2 body - string_type* 
alleles; // pointer to pointer of ref alleles and their lengths - char* ID; - - bool hasGenotypes; - BYTE ploidy; - - // - U32 filter_start; - U32 n_filter; - base_type filter_key; - - // Vectors of identifiers - U16 filterPointer; - U16 infoPointer; - U16 formatPointer; - gt_support_type gt_support; - - // FILTER - BCFKeyTuple* filterID; - BCFKeyTuple* infoID; - BCFKeyTuple* formatID; -}; - -} -} - -#endif /* BCF_BCFENTRY_H_ */ diff --git a/lib/io/bcf/BCFReader.cpp b/lib/io/bcf/BCFReader.cpp deleted file mode 100644 index 07c3f47..0000000 --- a/lib/io/bcf/BCFReader.cpp +++ /dev/null @@ -1,316 +0,0 @@ -#include "io/bcf/BCFReader.h" - -namespace tachyon{ -namespace bcf{ - -BCFReader::BCFReader() : - filesize(0), - current_pointer(0), - map_gt_id(-1), - state(bcf_reader_state::BCF_INIT), - n_entries(0), - n_capacity(0), - n_carry_over(0), - entries(nullptr), - b_data_read(0), - skip_invariant_sites(false) -{} - -BCFReader::BCFReader(const std::string& file_name) : - file_name(file_name), - filesize(0), - current_pointer(0), - map_gt_id(-1), - state(bcf_reader_state::BCF_INIT), - n_entries(0), - n_capacity(0), - n_carry_over(0), - entries(nullptr), - b_data_read(0), - skip_invariant_sites(false) -{} - -BCFReader::~BCFReader(){ - if(this->entries != nullptr){ - for(std::size_t i = 0; i < this->n_entries; ++i) - ((this->entries + i)->~BCFEntry()); - - ::operator delete[](static_cast(this->entries)); - } -} - - -bool BCFReader::nextBlock(void){ - // Stream died - if(!this->stream.good()){ - std::cerr << utility::timestamp("ERROR", "BCF") << "Stream died!" 
<< std::endl; - this->state = bcf_reader_state::BCF_STREAM_ERROR; - return false; - } - - // EOF - if(this->stream.tellg() == this->filesize){ - this->state = bcf_reader_state::BCF_EOF; - return false; - } - - if(!this->bgzf_controller.InflateBlock(this->stream, this->buffer)){ - if(this->bgzf_controller.buffer.size() == 0) this->state = bcf_reader_state::BCF_EOF; - else this->state = bcf_reader_state::BCF_ERROR; - return false; - } - - // Reset buffer - this->buffer.reset(); - this->current_pointer = 0; - this->state = bcf_reader_state::BCF_OK; - this->b_data_read += this->bgzf_controller.buffer.size(); - - return true; -} - -bool BCFReader::nextVariant(reference entry){ - if(this->current_pointer == this->bgzf_controller.buffer.size()){ - if(!this->nextBlock()) - return false; - } - - if(this->current_pointer + 8 > this->bgzf_controller.buffer.size()){ - const S32 partial = (S32)this->bgzf_controller.buffer.size() - this->current_pointer; - entry.add(&this->bgzf_controller.buffer[this->current_pointer], this->bgzf_controller.buffer.size() - this->current_pointer); - if(!this->nextBlock()){ - std::cerr << utility::timestamp("ERROR","BCF") << "Failed to get next block in partial" << std::endl; - return false; - } - - entry.add(&this->bgzf_controller.buffer[0], 8 - partial); - this->current_pointer = 8 - partial; - } else { - entry.add(&this->bgzf_controller.buffer[this->current_pointer], 8); - this->current_pointer += 8; - } - - U64 remainder = entry.sizeBody(); - while(remainder){ - if(this->current_pointer + remainder > this->bgzf_controller.buffer.size()){ - entry.add(&this->bgzf_controller.buffer[this->current_pointer], this->bgzf_controller.buffer.size() - this->current_pointer); - remainder -= this->bgzf_controller.buffer.size() - this->current_pointer; - if(!this->nextBlock()) - return false; - - } else { - entry.add(&this->bgzf_controller.buffer[this->current_pointer], remainder); - this->current_pointer += remainder; - remainder = 0; - break; - } - } - - 
if(!entry.parse(this->header.samples)){ - std::cerr << "parse error" << std::endl; - exit(1); - } - - if(this->entries[this->n_entries].body->n_fmt > 0 && this->map_gt_id != -1){ - if(this->entries[this->n_entries].formatID[0].mapID == this->map_gt_id){ - this->entries[this->n_entries].hasGenotypes = true; - entry.assessGenotypes(this->header.samples); - } - } - - return true; -} - -bool BCFReader::getVariants(const U32 n_variants, const double bp_window, bool across_contigs){ - S64 firstPos = 0; - S32 firstContig = -1; - // If there is any carry over - if(this->n_carry_over == 1){ - value_type last(this->entries[this->n_entries]); - if(last.body->n_fmt > 0 && this->map_gt_id != -1){ - if(last.formatID[0].mapID == this->map_gt_id) - last.hasGenotypes = true; - } - - for(std::size_t i = 0; i <= this->n_entries; ++i) - ((this->entries + i)->~BCFEntry()); - - if(n_variants + 1 > this->capacity()){ - ::operator delete[](static_cast(this->entries)); - this->entries = static_cast(::operator new[]((n_variants + 1)*sizeof(value_type))); - this->n_capacity = n_variants + 1; - } - - new( &this->entries[0] ) value_type( last ); - this->n_entries = 1; - firstPos = this->entries[0].body->POS; - firstContig = this->entries[0].body->CHROM; - } - // Nothing carried over - else { - // Only set this to 0 if there is no carry - // over data from the previous cycle - for(std::size_t i = 0; i < this->n_entries; ++i) - ((this->entries + i)->~BCFEntry()); - - if(n_variants + 1 > this->capacity()){ - ::operator delete[](static_cast(this->entries)); - this->entries = static_cast(::operator new[]((n_variants + 1)*sizeof(value_type))); - this->n_capacity = n_variants + 1; - } - - //delete this->entries; - this->n_entries = 0; - } - - // Entries - this->n_carry_over = 0; - - // EOF - if(this->state == bcf_reader_state::BCF_EOF) - return false; - - U32 retrieved_variants = 0; - bool is_new = true; - - while(retrieved_variants < n_variants){ - //for(U32 i = 0; i < n_variants; ++i){ - 
if(this->current_pointer == this->bgzf_controller.buffer.size()){ - if(!this->nextBlock()){ - return(this->size() > 0); - } - } - - if(is_new) new( &this->entries[this->n_entries] ) value_type( this->header.samples * 2 ); - if(!this->nextVariant(this->entries[this->n_entries])){ - std::cerr << "failed to get next" << std::endl; - return false; - } - - - if(this->skip_invariant_sites && this->entries[this->n_entries].gt_support.invariant == true){ - //std::cerr << "not getting " << std::endl; - //((this->entries + this->n_entries)->~BCFEntry()); - //&this->entries[this->n_entries] = static_cast(::operator new[](sizeof(value_type))); - this->entries[this->n_entries].reset(); - is_new = false; - continue; - } - is_new = true; - ++retrieved_variants; - - // Check position - if(this->n_entries == 0){ - firstPos = this->entries[0].body->POS; - firstContig = this->entries[0].body->CHROM; - } - - // Make sure that data does not span over - // multiple CHROM fields - // Note: This property is maintainable only - // when the input file is sorted - if(!across_contigs){ - if(this->entries[this->n_entries].body->CHROM != firstContig){ - //std::cerr << utility::timestamp("LOG","CONTIG") << "Switch in CHROM: " << firstContig << "->" << this->entries[this->n_entries].body->CHROM << std::endl; - this->n_carry_over = 1; - return(this->size() > 0); - } - } - - // Check break condition for window - if(this->entries[this->n_entries].body->POS - firstPos > bp_window){ - ++this->n_entries; - break; - } - - // Increment entries in return block - ++this->n_entries; - } - - return(this->size() > 0); -} - -bool BCFReader::parseHeader(void){ - if(this->bgzf_controller.buffer.size() == 0){ - std::cerr << utility::timestamp("ERROR","BCF") << "No buffer!" << std::endl; - return false; - } - - if(this->bgzf_controller.buffer.size() < 5){ - std::cerr << utility::timestamp("ERROR","BCF") << "Corrupted header!" 
<< std::endl; - return false; - } - - if(strncmp(&this->bgzf_controller.buffer.buffer[0], "BCF\2\2", 5) != 0){ - std::cerr << utility::timestamp("ERROR","BCF") << "Failed to validate MAGIC" << std::endl; - return false; - } - - const U32 l_text = *reinterpret_cast(&this->bgzf_controller.buffer[5]) + 4; - this->header_buffer.resize(l_text + 1); - - if(l_text - 5 < this->bgzf_controller.buffer.size()){ - this->header_buffer.Add(&this->bgzf_controller.buffer[5], l_text); - this->current_pointer = l_text + 5; - } else { - U32 head_read = this->bgzf_controller.buffer.size() - 5; - this->header_buffer.Add(&this->bgzf_controller.buffer[5], this->bgzf_controller.buffer.size() - 5); - - //U32 p = 0; - while(this->nextBlock()){ - if(head_read + this->bgzf_controller.buffer.size() >= l_text){ - this->header_buffer.Add(&this->bgzf_controller.buffer[0], l_text - head_read); - this->current_pointer = l_text - head_read; - break; - } - head_read += this->bgzf_controller.buffer.size(); - this->header_buffer.Add(&this->bgzf_controller.buffer[0], this->bgzf_controller.buffer.size()); - } - } - - if(!this->header.parse(&this->header_buffer[0], this->header_buffer.size())){ - std::cerr << utility::timestamp("ERROR","BCF") << "Failed to parse header!" << std::endl; - return false; - } - - return true; -} - -bool BCFReader::open(){ - if(this->file_name.size() == 0) - return false; - - this->stream.open(this->file_name, std::ios::binary | std::ios::in | std::ios::ate); - if(!this->stream.good()){ - std::cerr << utility::timestamp("ERROR", "BCF") << "Failed to open file: " << this->file_name << std::endl; - return false; - } - - this->filesize = this->stream.tellg(); - this->stream.seekg(0); - - if(!this->stream.good()){ - std::cerr << utility::timestamp("ERROR", "BCF") << "Bad stream!" << std::endl; - return false; - } - - if(!this->nextBlock()){ - std::cerr << utility::timestamp("ERROR","BCF") << "Failed to get first block!" 
<< std::endl; - return false; - } - - if(!this->parseHeader()){ - std::cerr << utility::timestamp("ERROR","BCF") << "Failed to parse header!" << std::endl; - return false; - } - - return true; -} - -bool BCFReader::open(const std::string input){ - this->file_name = input; - return(this->open()); -} - -} -} diff --git a/lib/io/bcf/BCFReader.h b/lib/io/bcf/BCFReader.h deleted file mode 100644 index 7870f07..0000000 --- a/lib/io/bcf/BCFReader.h +++ /dev/null @@ -1,165 +0,0 @@ -#ifndef BCF_BCFREADER_H_ -#define BCF_BCFREADER_H_ - -#include - -#include "BCFEntry.h" -#include "io/compression/BGZFController.h" - - -namespace tachyon { -namespace bcf { - -class BCFReader{ - typedef BCFReader self_type; - typedef BCFEntry value_type; - typedef value_type& reference; - typedef const value_type& const_reference; - typedef value_type* pointer; - typedef const value_type* const_pointer; - typedef std::ptrdiff_t difference_type; - typedef std::size_t size_type; - typedef io::BasicBuffer buffer_type; - typedef io::BGZFController bgzf_controller_type; - typedef vcf::VCFHeader header_type; - typedef core::HeaderContig contig_type; - -public: - enum bcf_reader_state{BCF_INIT, BCF_OK, BCF_ERROR, BCF_EOF, BCF_STREAM_ERROR}; - -public: - BCFReader(); - BCFReader(const std::string& file_name); - ~BCFReader(); - - class iterator{ - private: - typedef iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - iterator(pointer ptr) : ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - reference operator*() const{ return *ptr_; } - pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - class const_iterator{ - private: - typedef const_iterator self_type; - typedef std::forward_iterator_tag iterator_category; - - public: - const_iterator(pointer ptr) : 
ptr_(ptr) { } - void operator++() { ptr_++; } - void operator++(int junk) { ptr_++; } - const_reference operator*() const{ return *ptr_; } - const_pointer operator->() const{ return ptr_; } - bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; } - bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; } - private: - pointer ptr_; - }; - - // Element access - inline reference at(const size_type& position){ return(this->entries[position]); } - inline const_reference at(const size_type& position) const{ return(this->entries[position]); } - inline reference operator[](const size_type& position){ return(this->entries[position]); } - inline const_reference operator[](const size_type& position) const{ return(this->entries[position]); } - inline pointer data(void){ return(this->entries); } - inline const_pointer data(void) const{ return(this->entries); } - inline reference front(void){ return(this->entries[0]); } - inline const_reference front(void) const{ return(this->entries[0]); } - inline reference back(void){ return(this->entries[this->n_entries - 1]); } - inline const_reference back(void) const{ return(this->entries[this->n_entries - 1]); } - - // Capacity - inline const bool empty(void) const{ return(this->n_entries == 0); } - inline const size_type& size(void) const{ return(this->n_entries); } - inline const size_type& capacity(void) const{ return(this->n_capacity); } - - // Iterator - inline iterator begin(){ return iterator(&this->entries[0]); } - inline iterator end() { return iterator(&this->entries[this->n_entries]); } - inline const_iterator begin() const{ return const_iterator(&this->entries[0]); } - inline const_iterator end() const{ return const_iterator(&this->entries[this->n_entries]); } - inline const_iterator cbegin() const{ return const_iterator(&this->entries[0]); } - inline const_iterator cend() const{ return const_iterator(&this->entries[this->n_entries]); } - - /**< - * Attempts to open a target input file. 
Internally - * checks if the input file is an actual BCF file and - * the first TGZF can be opened and the BCF header is - * valid. - * @param input Input target BCF file - * @return Returns TRUE upon success or FALSE otherwise - */ - bool open(const std::string input); - bool open(void); - - /**< - * Loads another TGZF block into memory - * @return Returns TRUE upon success or FALSE otherwise - */ - bool nextBlock(void); - - /**< - * Attempts to overload `entry` input BCFEntry with - * data - * @param entry Input BCFEntry that will be overloaded - * @return Returns TRUE upon success or FALSE otherwise - */ - bool nextVariant(reference entry); - - /**< - * Attempts to load either `n_variants` number of variants or - * variants covering >= `bp_window` base pairs. The function - * is successful whenever n_variants or bp_window is satisfied - * or if there is no more variants to load. - * @param n_variants Number of variants - * @param bp_window Non-overlapping window size in base-pairs - * @param across_contigs Allow the algorithm to span over two or more different chromosomes - * @return Returns TRUE upon success or FALSE otherwise - */ - bool getVariants(const U32 n_variants, const double bp_window, bool across_contigs = false); // get N number of variants into buffer - - inline const bool hasCarryOver(void) const{ return(this->n_carry_over); } - inline void setFilterInvariantSites(const bool yes){ this->skip_invariant_sites; } - -private: - /**< - * Parse the TGZF header of a block given - * the current buffer data loaded. 
- * Internal use only - * @return Returns TRUE on success or FALSE otherwise - */ - bool parseHeader(void); - -public: - std::string file_name; - std::ifstream stream; - U64 filesize; - U32 current_pointer; - S32 map_gt_id; - buffer_type buffer; - buffer_type header_buffer; - bgzf_controller_type bgzf_controller; - header_type header; - bcf_reader_state state; - size_type n_entries; - size_type n_capacity; - size_type n_carry_over; - pointer entries; - U64 b_data_read; - bool skip_invariant_sites; -}; - -} -} - -#endif /* BCF_BCFREADER_H_ */ diff --git a/lib/io/compression/BGZFController.cpp b/lib/io/compression/BGZFController.cpp deleted file mode 100644 index 19b6371..0000000 --- a/lib/io/compression/BGZFController.cpp +++ /dev/null @@ -1,148 +0,0 @@ -#include -#include - -#include "BGZFController.h" -#include "third_party/zlib/zconf.h" -#include "third_party/zlib/zlib.h" - -namespace tachyon { -namespace io { - - -BGZFController::BGZFController(){} - -BGZFController::BGZFController(const char* data, const U32 length){} - -BGZFController::~BGZFController(){ } - -void BGZFController::Clear(){ this->buffer.reset(); } - -U32 BGZFController::InflateSize(buffer_type& input) const{ - const header_type& header = *reinterpret_cast(&input.buffer[0]); - if(!header.Validate()){ - std::cerr << utility::timestamp("ERROR","BGZF") << "Invalid BGZF header" << std::endl; - std::cerr << utility::timestamp("DEBUG","BGZF") << "Output length: " << header.BSIZE << std::endl; - std::cerr << utility::timestamp("DEBUG","BGZF") << std::endl; - std::cerr << header << std::endl; - exit(1); - } - - return header.BSIZE; -} - -bool BGZFController::Inflate(buffer_type& input, buffer_type& output) const{ - const header_type& header = *reinterpret_cast(&input[0]); - if(!header.Validate()){ - std::cerr << utility::timestamp("ERROR","BGZF") << "Invalid BGZF header" << std::endl; - std::cerr << utility::timestamp("DEBUG","BGZF") << "Output length: " << header.BSIZE << std::endl; - std::cerr << 
utility::timestamp("DEBUG","BGZF") << std::endl; - std::cerr << header << std::endl; - exit(1); - } - - return(this->__Inflate(input, output, header)); -} - -bool BGZFController::Inflate(buffer_type& input, buffer_type& output, const header_type& header) const{ - return(this->__Inflate(input, output, header)); -} - -bool BGZFController::__Inflate(buffer_type& input, buffer_type& output, const header_type& header) const{ - const U32& uncompressedLength = *reinterpret_cast(&input.buffer[input.size() - sizeof(U32)]); - - if(output.size() + uncompressedLength >= output.capacity()) - output.resize((output.size() + uncompressedLength) + 65536); - - //U32* crc = reinterpret_cast(&input.buffer[input.size() - 2*sizeof(U32)]); - - // Bug fix for ZLIB when overflowing an U32 - U64 avail_out = output.capacity() - output.size(); - if(avail_out > std::numeric_limits::max()) - avail_out = std::numeric_limits::max(); - - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)&input.buffer[constants::BGZF_BLOCK_HEADER_LENGTH]; - zs.avail_in = (header.BSIZE + 1) - 16; - zs.next_out = (Bytef*)&output.buffer[output.n_chars]; - zs.avail_out = (U32)avail_out; - - int status = inflateInit2(&zs, constants::GZIP_WINDOW_BITS); - - if(status != Z_OK){ - std::cerr << utility::timestamp("ERROR","BGZF") << "Zlib inflateInit failed: " << (int)status << std::endl; - exit(1); - } - - // decompress - status = inflate(&zs, Z_FINISH); - if(status != Z_STREAM_END){ - inflateEnd(&zs); - std::cerr << utility::timestamp("ERROR","BGZF") << "Zlib inflateEnd failed: " << (int)status << std::endl; - exit(1); - } - - // finalize - status = inflateEnd(&zs); - if(status != Z_OK){ - inflateEnd(&zs); - std::cerr << utility::timestamp("ERROR","BGZF") << "Zlib inflateFinalize failed: " << (int)status << std::endl; - exit(1); - } - - //if(zs.total_out == 0) - // std::cerr << utility::timestamp("LOG", "BGZF") << "Detected empty BGZF block" << std::endl; - - output.n_chars += zs.total_out; - - 
return(true); -} - -bool BGZFController::InflateBlock(std::ifstream& stream, buffer_type& input){ - input.resize(sizeof(header_type)); - stream.read(&input.buffer[0], io::constants::BGZF_BLOCK_HEADER_LENGTH); - if(!stream.good()){ - std::cerr << utility::timestamp("ERROR", "BCF") << "Truncated file..." << std::endl; - return false; - } - - const header_type* h = reinterpret_cast(&input.buffer[0]); - input.n_chars = io::constants::BGZF_BLOCK_HEADER_LENGTH; - if(!h->Validate()){ - std::cerr << utility::timestamp("ERROR", "BCF") << "Failed to validate!" << std::endl; - std::cerr << *h << std::endl; - return false; - } - - input.resize(h->BSIZE + 1); // make sure all data will fit - - // Recast because if buffer is resized then the pointer address is incorrect - // resulting in segfault - h = reinterpret_cast(&input.buffer[0]); - - stream.read(&input.buffer[io::constants::BGZF_BLOCK_HEADER_LENGTH], (h->BSIZE + 1) - io::constants::BGZF_BLOCK_HEADER_LENGTH); - if(!stream.good()){ - std::cerr << utility::timestamp("ERROR", "BCF") << "Truncated file..." << std::endl; - return false; - } - - input.n_chars = h->BSIZE + 1; - const U32 uncompressed_size = *reinterpret_cast(&input[input.size() - sizeof(U32)]); - this->buffer.resize(uncompressed_size + 1); - this->buffer.reset(); - - if(!this->Inflate(input, this->buffer)){ - std::cerr << utility::timestamp("ERROR", "BCF") << "Failed inflate!" 
<< std::endl; - return false; - } - - // BGZF EOF marker - if(this->buffer.size() == 0) - return false; - - return true; -} - -} /* namespace IO */ -} /* namespace Tachyon */ diff --git a/lib/io/compression/BGZFController.h b/lib/io/compression/BGZFController.h deleted file mode 100644 index 2a64178..0000000 --- a/lib/io/compression/BGZFController.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef BGZFCONTROLLER_H_ -#define BGZFCONTROLLER_H_ - -#include "GZFHeader.h" -#include "support/helpers.h" -#include "third_party/zlib/zconf.h" -#include "third_party/zlib/zlib.h" -#include "io/basic_buffer.h" - -namespace tachyon { -namespace io { - -class BGZFController { - typedef BGZFController self_type; - typedef io::BasicBuffer buffer_type; - typedef BGZFHeader header_type; - - public: - BGZFController(); - BGZFController(const char* data, const U32 length); - ~BGZFController(); - - void Clear(); - bool Inflate(buffer_type& input, buffer_type& output, const header_type& header) const; - bool Inflate(buffer_type& input, buffer_type& output) const; - U32 InflateSize(buffer_type& input) const; - bool InflateBlock(std::ifstream& stream, buffer_type& input); - - friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ - stream.write(entry.buffer.buffer, entry.buffer.size()); - return stream; - } - - private: - bool __Inflate(buffer_type& input, buffer_type& output, const header_type& header) const; - - public: - buffer_type buffer; - }; - -} /* namespace IO */ -} /* namespace Tomahawk */ - -#endif /* BGZFCONTROLLER_H_ */ diff --git a/lib/io/compression/GZFConstants.h b/lib/io/compression/GZFConstants.h deleted file mode 100644 index c51f66c..0000000 --- a/lib/io/compression/GZFConstants.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef GZFCONSTANTS_H_ -#define GZFCONSTANTS_H_ - -#include "support/type_definitions.h" - -namespace tachyon{ -namespace io{ -namespace constants{ - -// zlib & TGZF constants -const BYTE GZIP_ID1 = 31; -const BYTE GZIP_ID2 = 139; -const BYTE 
CM_DEFLATE = 8; -const BYTE FLG_FEXTRA = 4; -const BYTE OS_UNKNOWN = 255; -const BYTE TGZF_XLEN = 8; -const BYTE TGZF_ID1 = 84; -const BYTE TGZF_ID2 = 90; -const BYTE TGZF_LEN = 4; -const BYTE BGZF_XLEN = 6; -const BYTE BGZF_ID1 = 66; -const BYTE BGZF_ID2 = 67; -const BYTE BGZF_LEN = 2; - -const SBYTE GZIP_WINDOW_BITS = -15; -const SBYTE Z_DEFAULT_MEM_LEVEL = 8; -const BYTE TGZF_BLOCK_HEADER_LENGTH = 20; -const BYTE TGZF_BLOCK_FOOTER_LENGTH = 8; -const BYTE BGZF_BLOCK_HEADER_LENGTH = 18; - -} -} -} - -#endif /* GZFCONSTANTS_H_ */ diff --git a/lib/io/compression/GZFHeader.h b/lib/io/compression/GZFHeader.h deleted file mode 100644 index 1e55528..0000000 --- a/lib/io/compression/GZFHeader.h +++ /dev/null @@ -1,163 +0,0 @@ -#ifndef TGZFHEADER_H_ -#define TGZFHEADER_H_ - -#include "GZFConstants.h" - -namespace tachyon{ -namespace io{ - -#pragma pack(push, 1) -struct __attribute__((packed, aligned(1))) __headerBase{ -private: - typedef __headerBase self_type; - -public: - __headerBase(); - ~__headerBase(); - - BYTE ID1; - BYTE ID2; - BYTE CM; - BYTE FLG; - U32 MTIME; - BYTE XFL; - BYTE OS; - U16 XLEN; - BYTE SI1; - BYTE SI2; - U16 SLEN; - - friend std::ostream& operator<<(std::ostream& os, const self_type& header){ - os << "ID1\t" << (U32)header.ID1 << '\n'; - os << "ID2\t" << (U32)header.ID2 << '\n'; - os << "CM\t" << (U32)header.CM << '\n'; - os << "FLG\t" << (U32)header.FLG << '\n'; - os << "MTIME\t" << (S32)header.MTIME << '\n'; - os << "XFL\t" << (U32)header.XFL << '\n'; - os << "OS\t" << (U32)header.OS << '\n'; - os << "XLEN\t" << (U16)header.XLEN << '\n'; - os << "SI1\t" << (U32)header.SI1 << '\n'; - os << "SI2\t" << (U32)header.SI2 << '\n'; - os << "SLEN\t" << (U16)header.SLEN; - - return os; - } - - friend std::istream& operator>>(std::istream& stream, self_type& header){ - stream.read(reinterpret_cast(&header.ID1), sizeof(BYTE)); - stream.read(reinterpret_cast(&header.ID2), sizeof(BYTE)); - stream.read(reinterpret_cast(&header.CM), sizeof(BYTE)); - 
stream.read(reinterpret_cast(&header.FLG), sizeof(BYTE)); - stream.read(reinterpret_cast(&header.MTIME), sizeof(U32)); - stream.read(reinterpret_cast(&header.XFL), sizeof(BYTE)); - stream.read(reinterpret_cast(&header.OS), sizeof(BYTE)); - stream.read(reinterpret_cast(&header.XLEN), sizeof(U16)); - stream.read(reinterpret_cast(&header.SI1), sizeof(BYTE)); - stream.read(reinterpret_cast(&header.SI2), sizeof(BYTE)); - stream.read(reinterpret_cast(&header.SLEN), sizeof(U16)); - return(stream); - } -}; - -/* - TGZF header - +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ - | 31|139| 8| 4| 0| 0|255| 6| 84| 90| 4| BLK_LEN| - +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ - BGZF extension: - ^ ^ ^ ^ - | | | | - FLG.EXTRA XLEN T Z - TGZF format is compatible with GZIP. It limits the size of each compressed - block to 2^32 bytes and adds and an extra "BC" field in the gzip header which - records the size. -*/ -struct __attribute__((packed, aligned(1))) TGZFHeader : public __headerBase{ -private: - typedef TGZFHeader self_type; - typedef __headerBase parent_type; - -public: - U32 BSIZE; // remainder size - - inline bool Validate(void) const{ - return(this->ID1 == constants::GZIP_ID1 - && this->ID2 == constants::GZIP_ID2 - && this->CM == constants::CM_DEFLATE - && this->FLG == constants::FLG_FEXTRA - && this->XLEN == constants::TGZF_XLEN - && this->SI1 == constants::TGZF_ID1 - && this->SI2 == constants::TGZF_ID2 - && this->SLEN == constants::TGZF_LEN - ); - } - - friend std::ostream& operator<<(std::ostream& os, const self_type& header){ - const parent_type& b = *reinterpret_cast(&header); - os << b << '\n'; - os << "BSIZE\t" << (U32)header.BSIZE; - return os; - } - - friend std::istream& operator>>(std::istream& stream, self_type& header){ - parent_type& b = *reinterpret_cast(&header); - stream >> b; - stream.read(reinterpret_cast(&header.BSIZE), sizeof(U32)); - return(stream); - } -}; - -/* - BGZF/GZIP 
header (specialised from RFC 1952; little endian): - +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ - | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN| - +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ - BGZF extension: - ^ ^ ^ ^ - | | | | - FLG.EXTRA XLEN B C - BGZF format is compatible with GZIP. It limits the size of each compressed - block to 2^16 bytes and adds and an extra "BC" field in the gzip header which - records the size. -*/ -struct __attribute__((packed, aligned(1))) BGZFHeader : public __headerBase{ -private: - typedef BGZFHeader self_type; - typedef __headerBase parent_type; - -public: - U16 BSIZE; // remainder size - - inline bool Validate(void) const{ - return(this->ID1 == constants::GZIP_ID1 - && this->ID2 == constants::GZIP_ID2 - && this->CM == constants::CM_DEFLATE - && this->FLG == constants::FLG_FEXTRA - && this->XLEN == constants::BGZF_XLEN - && this->SI1 == constants::BGZF_ID1 - && this->SI2 == constants::BGZF_ID2 - && this->SLEN == constants::BGZF_LEN - ); - } - - friend std::ostream& operator<<(std::ostream& os, const self_type& header){ - const parent_type& b = *reinterpret_cast(&header); - os << b << '\n'; - os << "BSIZE\t" << (U16)header.BSIZE; - return os; - } - - friend std::istream& operator>>(std::istream& stream, self_type& header){ - parent_type& b = *reinterpret_cast(&header); - stream >> b; - stream.read(reinterpret_cast(&header.BSIZE), sizeof(U16)); - return(stream); - } -}; -#pragma pack(pop) - -} -} - - -#endif /* TGZFHEADER_H_ */ diff --git a/lib/io/compression/TGZFController.cpp b/lib/io/compression/TGZFController.cpp deleted file mode 100644 index 2d812c9..0000000 --- a/lib/io/compression/TGZFController.cpp +++ /dev/null @@ -1,248 +0,0 @@ -#include -#include -#include - -#include "TGZFController.h" -#include "GZFConstants.h" -#include "third_party/zlib/zconf.h" -#include "third_party/zlib/zlib.h" - -namespace tachyon { -namespace io { - -TGZFController::TGZFController() : 
compression_level(Z_DEFAULT_COMPRESSION), bit_window(constants::GZIP_WINDOW_BITS){} - -TGZFController::TGZFController(const char* data, const U32 length) : compression_level(Z_DEFAULT_COMPRESSION), bit_window(constants::GZIP_WINDOW_BITS){} - -TGZFController::TGZFController(const U32 largest_block_size) : compression_level(Z_DEFAULT_COMPRESSION), bit_window(constants::GZIP_WINDOW_BITS), buffer(largest_block_size){} - -TGZFController::~TGZFController(){ } - -void TGZFController::Clear(){ this->buffer.reset(); } - -bool TGZFController::Inflate(buffer_type& input, buffer_type& output) const{ - const header_type& header = *reinterpret_cast(&input[0]); - if(!header.Validate()){ - std::cerr << utility::timestamp("ERROR","TGZF") << "Invalid TGZF header" << std::endl; - std::cerr << utility::timestamp("DEBUG","TGZF") << "Output length: " << header.BSIZE << std::endl; - std::cerr << utility::timestamp("DEBUG","TGZF") << std::endl; - std::cerr << header << std::endl; - exit(1); - } - - return(this->__Inflate(input, output, header)); -} - -bool TGZFController::Inflate(buffer_type& input, buffer_type& output, const header_type& header) const{ - return(this->__Inflate(input, output, header)); -} - -bool TGZFController::__Inflate(buffer_type& input, buffer_type& output, const header_type& header) const{ - const U32& uncompressedLength = *reinterpret_cast(&input.buffer[input.size() - sizeof(U32)]); - if(output.size() + uncompressedLength >= output.capacity()) - output.resize((output.size() + uncompressedLength) + 65536); - - // Not used - //U32* crc = reinterpret_cast(&input.buffer[input.size() - 2*sizeof(U32)]); - - // Fix for ZLIB when overflowing an U32 - U64 avail_out = output.capacity() - output.size(); - if(avail_out > std::numeric_limits::max()) - avail_out = std::numeric_limits::max(); - - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)&input.buffer[constants::TGZF_BLOCK_HEADER_LENGTH]; - zs.avail_in = (header.BSIZE + 1) - 16; - zs.next_out = 
(Bytef*)&output.buffer[output.n_chars]; - zs.avail_out = (U32)avail_out; - - int status = inflateInit2(&zs, this->bit_window); - - if(status != Z_OK){ - std::cerr << utility::timestamp("ERROR","TGZF") << "Zlib inflateInit failed: " << (int)status << std::endl; - exit(1); - } - - // decompress - status = inflate(&zs, Z_FINISH); - if(status != Z_STREAM_END){ - inflateEnd(&zs); - std::cerr << utility::timestamp("ERROR","TGZF") << "Zlib inflateEnd failed: " << (int)status << std::endl; - exit(1); - } - - // finalize - status = inflateEnd(&zs); - if(status != Z_OK){ - inflateEnd(&zs); - std::cerr << utility::timestamp("ERROR","TGZF") << "Zlib inflateFinalize failed: " << (int)status << std::endl; - exit(1); - } - - if(zs.total_out == 0) - std::cerr << utility::timestamp("LOG", "TGZF") << "Detected empty TGZF block" << std::endl; - - output.n_chars += zs.total_out; - - return(true); -} - -bool TGZFController::Deflate(const buffer_type& buffer){ - if(buffer.n_chars > std::numeric_limits::max()){ - std::cerr << utility::timestamp("ERROR", "TGZF") << "Format is limited to 2^32 bits. Buffer overflow..." 
<< std::endl; - return(false); - } - - // Resize to fit - this->buffer.resize(buffer); - - memset(this->buffer.buffer, 0, constants::TGZF_BLOCK_HEADER_LENGTH); - - this->buffer[0] = constants::GZIP_ID1; - this->buffer[1] = constants::GZIP_ID2; - this->buffer[2] = constants::CM_DEFLATE; - this->buffer[3] = constants::FLG_FEXTRA; - this->buffer[9] = constants::OS_UNKNOWN; - this->buffer[10] = constants::TGZF_XLEN; - this->buffer[12] = constants::TGZF_ID1; - this->buffer[13] = constants::TGZF_ID2; - this->buffer[14] = constants::TGZF_LEN; - //buffer 16->20 is set below - - // set compression level - //const int compressionLevel = Z_DEFAULT_COMPRESSION; - //const int compressionLevel = 9; - - // initialize zstream values - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)buffer.buffer; - zs.avail_in = buffer.n_chars; - zs.next_out = (Bytef*)&this->buffer[constants::TGZF_BLOCK_HEADER_LENGTH]; - zs.avail_out = this->buffer.width - - constants::TGZF_BLOCK_HEADER_LENGTH - - constants::TGZF_BLOCK_FOOTER_LENGTH; - - // Initialise the zlib compression algorithm - int status = deflateInit2(&zs, - this->compression_level, - Z_DEFLATED, - this->bit_window, - constants::Z_DEFAULT_MEM_LEVEL, - Z_DEFAULT_STRATEGY); - - if ( status != Z_OK ){ - std::cerr << utility::timestamp("ERROR", "ZLIB") << "DeflateBlock: zlib deflateInit2 failed" << std::endl; - return false; - } - - // compress the data - status = deflate(&zs, Z_FINISH); - - // if not at stream end - if ( status != Z_STREAM_END ) { - deflateEnd(&zs); - - // there was not enough space available in buffer - std::cerr << utility::timestamp("ERROR", "ZLIB") << "DeflateBlock: zlib deflate failed (insufficient space)" << std::endl; - return false; - } - - // finalize the compression routine - status = deflateEnd(&zs); - if ( status != Z_OK ){ - std::cerr << utility::timestamp("ERROR", "ZLIB") << "DeflateBlock: zlib deflateEnd failed (not ok)" << std::endl; - return false; - } - - // update compressedLength 
- const U32 compressedLength = zs.total_out + - constants::TGZF_BLOCK_HEADER_LENGTH + - constants::TGZF_BLOCK_FOOTER_LENGTH; - - // store the compressed length - U32* test = reinterpret_cast(&this->buffer[16]); - *test = compressedLength; - //std::cerr << utility::timestamp("DEBUG") << data.pointer << "->" << compressedLength-1 << " stored: " << *test << std::endl; - - //std::time_t result = std::time(nullptr); - //std::asctime(std::localtime(&result)); - //U32* time = reinterpret_cast(&this->buffer[4]); - //*time = result; - //*time = 0; - //std::cerr << utility::timestamp("DEBUG") << "Time: " << *time << std::endl; - - memset(&buffer.buffer[compressedLength - constants::TGZF_BLOCK_FOOTER_LENGTH], 0, constants::TGZF_BLOCK_FOOTER_LENGTH); - - // store the CRC32 checksum - U32 crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)buffer.buffer, buffer.size()); - U32* c = reinterpret_cast(&this->buffer[compressedLength - constants::TGZF_BLOCK_FOOTER_LENGTH]); - *c = crc; - U32 convert = buffer.size(); // avoid potential problems when casting from U64 to U32 by interpretation - U32* uncompressed = reinterpret_cast(&this->buffer[compressedLength - sizeof(U32)]); - *uncompressed = convert; // Store uncompressed length - - this->buffer.n_chars = compressedLength; - //std::cerr << "Writing: " << convert << '/' << *uncompressed << '\t' << compressedLength << '\t' << *test << '\t' << buffer.size() << '\t' << "At pos: " << (compressedLength - sizeof(U32)) << '\t' << buffer.pointer << '\t' << *c << '\t' << convert << std::endl; - - return true; -} - -bool TGZFController::Deflate(buffer_type& meta, buffer_type& rle){ - meta += rle; - return(this->Deflate(meta)); -} - -bool TGZFController::Deflate(buffer_type& meta, buffer_type& meta_complex, buffer_type& rle){ - meta += rle; - meta += meta_complex; - return(this->Deflate(meta)); -} - -bool TGZFController::InflateBlock(std::ifstream& stream, buffer_type& input){ - input.resize(sizeof(header_type)); - stream.read(&input.buffer[0], 
io::constants::TGZF_BLOCK_HEADER_LENGTH); - const header_type* h = reinterpret_cast(&input.buffer[0]); - input.n_chars = io::constants::TGZF_BLOCK_HEADER_LENGTH; - if(!h->Validate()){ - std::cerr << utility::timestamp("ERROR", "TGZF") << "Failed to validate!" << std::endl; - std::cerr << *h << std::endl; - return false; - } - - input.resize(h->BSIZE); // make sure all data will fit - - // Recast because if buffer is resized then the pointer address is incorrect - // resulting in segfault - h = reinterpret_cast(&input.buffer[0]); - - stream.read(&input.buffer[io::constants::TGZF_BLOCK_HEADER_LENGTH], h->BSIZE - io::constants::TGZF_BLOCK_HEADER_LENGTH); - if(!stream.good()){ - std::cerr << utility::timestamp("ERROR", "TGZF") << "Truncated file..." << std::endl; - return false; - } - - input.n_chars = h->BSIZE; - const U32 uncompressed_size = *reinterpret_cast(&input[input.size() - sizeof(U32)]); - this->buffer.resize(uncompressed_size); - this->buffer.reset(); - - if(!this->Inflate(input, this->buffer)){ - std::cerr << utility::timestamp("ERROR", "TGZF") << "Failed inflate!" 
<< std::endl; - return false; - } - - // TGZF EOF marker - if(this->buffer.size() == 0) - return false; - - return true; -} - - -} -} diff --git a/lib/io/compression/TGZFController.h b/lib/io/compression/TGZFController.h deleted file mode 100644 index fb6faef..0000000 --- a/lib/io/compression/TGZFController.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef TGZFCONTROLLER_H_ -#define TGZFCONTROLLER_H_ - -#include - -#include "io/basic_buffer.h" -#include "GZFHeader.h" -#include "support/helpers.h" -#include "third_party/zlib/zconf.h" -#include "third_party/zlib/zlib.h" - -namespace tachyon{ -namespace io{ - -class TGZFController{ -private: - typedef TGZFController self_type; - -protected: - typedef io::BasicBuffer buffer_type; - typedef TGZFHeader header_type; - -public: - TGZFController(); - TGZFController(const char* data, const U32 length); - TGZFController(const U32 largest_block_size); - ~TGZFController(); - - void Clear(); - bool Inflate(buffer_type& input, buffer_type& output, const header_type& header) const; - bool Inflate(buffer_type& input, buffer_type& output) const; - bool InflateBlock(std::ifstream& stream, buffer_type& input); - - bool Deflate(const buffer_type& buffer); - bool Deflate(buffer_type& meta, buffer_type& rle); - bool Deflate(buffer_type& meta, buffer_type& meta_complex, buffer_type& rle); - inline void setWindowSize(const S32& window){ this->bit_window = window; } - inline void setCompression(const S32& compression){ this->compression_level = compression; } - - friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){ - stream.write(entry.buffer.buffer, entry.buffer.n_chars); - return stream; - } - -private: - bool __Inflate(buffer_type& input, buffer_type& output, const header_type& header) const; - -public: - S32 compression_level; - S32 bit_window; - buffer_type buffer; -}; - -} -} - -#endif /* TGZFCONTROLLER_H_ */ diff --git a/lib/io/compression/TGZFControllerStream.cpp b/lib/io/compression/TGZFControllerStream.cpp 
deleted file mode 100644 index 43bb900..0000000 --- a/lib/io/compression/TGZFControllerStream.cpp +++ /dev/null @@ -1,137 +0,0 @@ -#include - -#include "TGZFControllerStream.h" -#include "support/magic_constants.h" // for SILENT - -namespace tachyon{ -namespace io{ - -TGZFControllerStream::TGZFControllerStream() : STATE(TGZF_STATE::TGZF_INIT), chunk_size(65536), total_out(0), bytes_read(0), BSIZE(0){} -TGZFControllerStream::~TGZFControllerStream(){} - -bool TGZFControllerStream::InflateOpen(std::ifstream& stream){ - this->buffer.reset(); - this->buffer.resize(this->chunk_size); - this->bytes_read = 0; - stream.read(&this->buffer.buffer[0], io::constants::TGZF_BLOCK_HEADER_LENGTH); - const header_type* h = reinterpret_cast(&this->buffer.buffer[0]); - - if(!h->Validate()){ - std::cerr << utility::timestamp("ERROR", "TGZF") << "Failed to validate!" << std::endl; - std::cerr << *h << std::endl; - exit(1); - } - - this->BSIZE = h->BSIZE - constants::TGZF_BLOCK_HEADER_LENGTH - constants::TGZF_BLOCK_FOOTER_LENGTH; // data to read - this->total_out = 0; - - this->d_stream = z_stream(); - this->d_stream.zalloc = Z_NULL; - this->d_stream.zfree = Z_NULL; - this->d_stream.opaque = Z_NULL; - this->d_stream.avail_in = 0; - this->d_stream.next_in = Z_NULL; - this->STATE = TGZF_STATE::TGZF_HEADER; - - int ret = inflateInit2(&this->d_stream, constants::GZIP_WINDOW_BITS); - if (ret != Z_OK){ - std::cerr << utility::timestamp("ERROR","TGZF") << "Failed inflatinit" << std::endl; - this->STATE = TGZF_STATE::TGZF_ERROR; - return ret; - } - - return true; -} - -bool TGZFControllerStream::Inflate(std::ifstream& stream, const BYTE* output, const U32& avail_out, U32& return_size){ - if(this->STATE == TGZF_INIT) - this->InflateOpen(stream); - - U32 avail_out_inner = 0; - U32 ret_inner = 0; - - while(this->__Inflate(stream, &output[avail_out_inner], avail_out - avail_out_inner, ret_inner)){ - return_size += ret_inner; - avail_out_inner += ret_inner; - } - - if(this->STATE == 
TGZF_STATE::TGZF_END) - return_size += ret_inner; - - if(return_size == 0) - return false; - - return true; -} - -bool TGZFControllerStream::__Inflate(std::ifstream& stream, const BYTE* output, const U32 avail_out, U32& return_size){ - // No space in output - if(avail_out == 0){ - //std::cerr << "RETURN no space" << std::endl; - return false; - } - - // No data available in buffer - // load some more - if(this->d_stream.avail_in == 0){ // and bytes read < BSIZE - this->buffer.reset(); - - U32 read_amount = this->chunk_size; - if(this->bytes_read + this->chunk_size > this->BSIZE) - read_amount = this->BSIZE - this->bytes_read; - - stream.read(&this->buffer.buffer[0], read_amount); - size_t total = stream.gcount(); - this->bytes_read += total; - - //std::cerr << "READ: " << total << "\t" << stream.tellg() << std::endl; - this->d_stream.avail_in = total; - this->d_stream.next_in = (Bytef*)&this->buffer.buffer[0]; - - if(total == 0){ - std::cerr << utility::timestamp("WARNING","TGZF") << "Nothing read!" 
<< std::endl; - return false; - } - this->buffer.n_chars = total; - } - - const U32 tot_out = this->d_stream.total_out; - this->d_stream.next_out = (Bytef*)output; - this->d_stream.avail_out = avail_out; - - int status = inflate(&this->d_stream, Z_NO_FLUSH); - - assert(status != Z_STREAM_ERROR); - - if(status != Z_OK && status != Z_STREAM_END){ - std::cerr << utility::timestamp("ERROR","TGZF") << "inflate failed: " << (int)status << std::endl; - exit(1); - } - - return_size = this->d_stream.total_out - tot_out; // bytes inflated - this->total_out += return_size; - - if(status != Z_STREAM_END){ - this->STATE = TGZF_STATE::TGZF_OK; - return true; - } - - // otherwise its final - status = inflateEnd(&this->d_stream); - if(status != Z_OK){ - inflateEnd(&this->d_stream); - std::cerr << utility::timestamp("ERROR","TGZF") << "Zlib inflateFinalize failed: " << (int)status << std::endl; - exit(1); - } - - if(this->d_stream.total_out == 0){ - if(!SILENT) - std::cerr << utility::timestamp("LOG", "TGZF") << "Detected empty TGZF block" << std::endl; - } - - this->STATE = TGZF_STATE::TGZF_END; - return false; -} - -} -} diff --git a/lib/io/compression/TGZFControllerStream.h b/lib/io/compression/TGZFControllerStream.h deleted file mode 100644 index 332ad85..0000000 --- a/lib/io/compression/TGZFControllerStream.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef IO_TGZFCONTROLLERSTREAM_H_ -#define IO_TGZFCONTROLLERSTREAM_H_ - -#include "TGZFController.h" - -namespace tachyon { -namespace io { - -class TGZFControllerStream : public TGZFController{ -protected: - enum TGZF_STATE {TGZF_OK, TGZF_HEADER, TGZF_END, TGZF_INIT, TGZF_ERROR}; - -public: - TGZFControllerStream(); - ~TGZFControllerStream(); - - bool Inflate(std::ifstream& stream, const BYTE* output, const U32& avail_out, U32& return_size); - - void reset(void){ - this->total_out = 0; - this->bytes_read = 0; - this->BSIZE = 0; - this->STATE = TGZF_STATE::TGZF_INIT; - } - -protected: - bool InflateOpen(std::ifstream& stream); - bool 
__Inflate(std::ifstream& stream, const BYTE* output, const U32 avail_out, U32& return_size); - -protected: - TGZF_STATE STATE; - U32 chunk_size; - U32 total_out; - U32 bytes_read; - U32 BSIZE; - z_stream d_stream; -}; - -} -} - - - -#endif /* IO_TGZFCONTROLLERSTREAM_H_ */ diff --git a/lib/io/compression/TGZFEntryIterator.h b/lib/io/compression/TGZFEntryIterator.h deleted file mode 100644 index 5e805ee..0000000 --- a/lib/io/compression/TGZFEntryIterator.h +++ /dev/null @@ -1,131 +0,0 @@ -#ifndef IO_TGZFENTRYITERATOR_H_ -#define IO_TGZFENTRYITERATOR_H_ - -#include - -#include "TGZFController.h" -#include "TGZFControllerStream.h" - -namespace tachyon{ -namespace io{ - -template -class TGZFEntryIterator : public TGZFControllerStream{ - typedef TGZFControllerStream parent_type; - -public: - TGZFEntryIterator(std::ifstream& stream, const U32 n_entries); - TGZFEntryIterator(std::ifstream& stream, const U32 n_entries, const U64 from, const U64 to); - ~TGZFEntryIterator(); - - bool nextEntry(const T*& entry); - -//private: -// void reset(void){ this->pointer = 0; this->n_entries = 0; this->entries = nullptr; } - -private: - U32 pointer; - U32 n_entries; - U32 chunk_size; - // Offset must equal a TGZF boundary - // no checks are made - U64 IO_start_offset; // start TGZF block offset - U64 IO_end_offset; // end TGZF block offset - std::ifstream& stream; - buffer_type output_buffer; - const T* entries; -}; - -template -TGZFEntryIterator::TGZFEntryIterator(std::ifstream& stream, const U32 n_entries) : - pointer(0), - n_entries(0), - chunk_size(n_entries*sizeof(T)), - IO_start_offset(0), - IO_end_offset(std::numeric_limits::max()), - stream(stream), - output_buffer(n_entries*sizeof(T)), - entries(nullptr) -{} - -template -TGZFEntryIterator::TGZFEntryIterator(std::ifstream& stream, const U32 n_entries, const U64 from, const U64 to) : - pointer(0), - n_entries(0), - chunk_size(n_entries*sizeof(T)), - IO_start_offset(from), - IO_end_offset(to), - stream(stream), - 
output_buffer(n_entries*sizeof(T)), - entries(nullptr){} - -template -TGZFEntryIterator::~TGZFEntryIterator(){ this->output_buffer.deleteAll(); } - -template -bool TGZFEntryIterator::nextEntry(const T*& entry){ - if(this->pointer == this->n_entries){ - //check if allowed to proceed - if(this->STATE == TGZF_STATE::TGZF_END){ - this->stream.seekg(io::constants::TGZF_BLOCK_FOOTER_LENGTH, std::ios::cur); - - if(this->stream.tellg() == this->IO_end_offset) - return false; - - this->reset(); // reset state - } - - U32 ret_size = 0; - if(!parent_type::Inflate(this->stream, (BYTE*)&output_buffer.data[0], this->chunk_size, ret_size)){ - if(this->STATE != TGZF_STATE::TGZF_END){ - std::cerr << utility::timestamp("ERROR","TGZF") << "Invalid state (" << this->STATE << ")" << std::endl; - exit(1); - } - } - - if(ret_size % sizeof(T) != 0){ - std::cerr << utility::timestamp("ERROR","TGZF") << "Impossible: " << ret_size % sizeof(T) << '\t' << ret_size << '/' << this->chunk_size << '\t' << "state: " << this->STATE << " size: " << sizeof(T) << std::endl; - exit(1); - } - - if(ret_size == 0){ - std::cerr << utility::timestamp("ERROR","TGZF") << "Returned nothing (state" << this->STATE << ")" << std::endl; - if(this->STATE == TGZF_STATE::TGZF_END){ - this->stream.seekg(io::constants::TGZF_BLOCK_FOOTER_LENGTH, std::ios::cur); - - if(this->stream.tellg() == this->IO_end_offset) - return false; - - this->reset(); // reset state - } - - if(!parent_type::Inflate(this->stream, (BYTE*)&output_buffer.data[0], this->chunk_size, ret_size)){ - if(this->STATE != TGZF_STATE::TGZF_END){ - std::cerr << utility::timestamp("ERROR","TGZF") << "Invalid state (" << this->STATE << ")" << std::endl; - exit(1); - } - } - - if(ret_size == 0){ - std::cerr << utility::timestamp("ERROR","TGZF") << "Impossible" << std::endl; - exit(1); - } - - } - - this->output_buffer.pointer = ret_size; - this->n_entries = ret_size / sizeof(T); - this->pointer = 0; - this->entries = reinterpret_cast(this->output_buffer.data); 
- } - - entry = &this->entries[this->pointer++]; - return true; -} - -} -} - - - -#endif /* IO_TGZFENTRYITERATOR_H_ */ diff --git a/lib/io/vcf/VCFHeader.cpp b/lib/io/vcf/VCFHeader.cpp deleted file mode 100644 index e548687..0000000 --- a/lib/io/vcf/VCFHeader.cpp +++ /dev/null @@ -1,455 +0,0 @@ -#include "VCFHeader.h" - -namespace tachyon { -namespace vcf{ - -VCFHeader::VCFHeader() : - error_bit(VCF_PASS), - samples(0), - version(0), - info_remap(nullptr), - format_remap(nullptr), - filter_remap(nullptr), - contigsHashTable(nullptr), - sampleHashTable(nullptr) -{ -} - -VCFHeader::~VCFHeader(){ - delete this->contigsHashTable; - delete this->sampleHashTable; - delete [] this->info_remap; - delete [] this->format_remap; - delete [] this->filter_remap; -} - -bool VCFHeader::parse(reader_type& stream){ - if(!this->parseFirstLine(stream)) - return false; - - // Read remainder lines - if(!this->parseHeaderLines(stream)) - return false; - - if(!this->buildContigTable()) - return false; - - // Copy string literal to header - U32 curPos = stream.tellg(); - U32 headerLength = curPos - stream.size(); - this->literal.resize(headerLength); - stream.stream_.seekg(0); - stream.stream_.read(&this->literal[0], headerLength); - stream.stream_.seekg(curPos); - - // Read samples line - if(!this->parseSampleLine(stream)) - return false; - - return true; -} - -bool VCFHeader::parse(const char* const data, const U32& length){ - U32 offset = 0; - if(!this->parseFirstLine(data, offset)) - return false; - - // Read remainder lines - if(!this->parseHeaderLines(data, offset)) - return false; - - if(!this->buildContigTable()) - return false; - - // Read samples line - if(!this->parseSampleLine(data, offset, length)) - return false; - - /**< - * Store IDX values from BCF in vectors - */ - for(U32 i = 0 ; i < this->lines.size(); ++i){ - if(this->lines[i].isIndexable == false) - continue; - - S32 idx = -1; - std::string type; - for(U32 j = 0; j < this->lines[i].pairs.size(); ++j){ - //std::cerr << 
j << ':' << this->lines[i].pairs[j].KEY << "=" << this->lines[i].pairs[j].VALUE << std::endl; - if(this->lines[i].pairs[j].KEY == "IDX") - idx = atoi(&this->lines[i].pairs[j].VALUE[0]); - - if(this->lines[i].type == vcf::TACHYON_VCF_HEADER_LINE_TYPE::YON_VCF_HEADER_INFO || - this->lines[i].type == vcf::TACHYON_VCF_HEADER_LINE_TYPE::YON_VCF_HEADER_FORMAT){ - - if(this->lines[i].pairs[j].KEY == "Type") - type = this->lines[i].pairs[j].VALUE; - } - } - assert(idx != -1); - //std::cerr << type << std::endl; - - // Push IDX into correct vector family - if(this->lines[i].type == vcf::TACHYON_VCF_HEADER_LINE_TYPE::YON_VCF_HEADER_INFO){ - // Valid INFO = Integer, Float, Flag, Character, and String - S32 primitive_type = -1; - if(type == "Integer") primitive_type = 0; - else if(type == "Float") primitive_type = 1; - else if(type == "Flag") primitive_type = 2; - else if(type == "Character") primitive_type = 3; - else if(type == "String") primitive_type = 4; - assert(primitive_type != -1); - - this->info_map.push_back(map_entry_type(this->lines[i].pairs[0].VALUE, idx, primitive_type)); - } else if(this->lines[i].type == vcf::TACHYON_VCF_HEADER_LINE_TYPE::YON_VCF_HEADER_FILTER){ - this->filter_map.push_back(map_entry_type(this->lines[i].pairs[0].VALUE, idx)); - } else if(this->lines[i].type == vcf::TACHYON_VCF_HEADER_LINE_TYPE::YON_VCF_HEADER_FORMAT){ - // Valid FORMAT = Integer, Float, Character, and String - S32 primitive_type = -1; - if(type == "Integer") primitive_type = 0; - else if(type == "Float") primitive_type = 1; - else if(type == "Character") primitive_type = 3; - else if(type == "String") primitive_type = 4; - assert(primitive_type != -1); - - this->format_map.push_back(map_entry_type(this->lines[i].pairs[0].VALUE, idx, primitive_type)); - } else { - std::cerr << "illegal format" << std::endl; - exit(1); - } - } - - // Sort data - std::sort(this->info_map.begin(), this->info_map.end()); - std::sort(this->filter_map.begin(), this->filter_map.end()); - 
std::sort(this->format_map.begin(), this->format_map.end()); - - if(this->info_map.size()){ - const S32 largest_idx = this->info_map.back().IDX; - this->info_remap = new S32[largest_idx + 1]; - U32 new_idx = 0; - for(U32 i = 0; i < this->info_map.size(); ++i){ - this->info_remap[this->info_map[i].IDX] = new_idx; - ++new_idx; - } - } - - if(this->format_map.size()){ - const S32 largest_idx = this->format_map.back().IDX; - this->format_remap = new S32[largest_idx + 1]; - U32 new_idx = 0; - for(U32 i = 0; i < this->format_map.size(); ++i){ - this->format_remap[this->format_map[i].IDX] = new_idx++; - } - } - - if(this->filter_map.size()){ - const S32 largest_idx = this->filter_map.back().IDX; - this->filter_remap = new S32[largest_idx + 1]; - U32 new_idx = 0; - for(U32 i = 0; i < this->filter_map.size(); ++i){ - this->filter_remap[this->filter_map[i].IDX] = new_idx++; - } - } - - return true; -} - -void VCFHeader::buildSampleTable(U64 samples){ - this->samples = samples; - delete this->sampleHashTable; - - if(this->samples*2 < 1024) - this->sampleHashTable = new hash_table_type(1024); - else - this->sampleHashTable = new hash_table_type(this->samples * 2); -} - -bool VCFHeader::checkLine(const char* data, const U32 length){ - header_line_type line(data, length); - if(line.Parse()){ - // If the line is a contig line: make sure it is legal - // for our purposes - if(line.isCONTIG()){ - contig_type contig; - BYTE found = 0; - - // Contig line has two values: - // ID: contig name - // length: for length is bp - for(U32 i = 0; i < line.size(); ++i){ - if(strncasecmp(&line[i].KEY[0], "ID", 2) == 0 && line[i].KEY.size() == 2){ - contig.name = line[i].VALUE; - ++found; - } else if(strncasecmp(&line[i].KEY[0], "length", 6) == 0 && line[i].KEY.size() == 6){ - contig.bp_length = atoi(&line[i].VALUE[0]); - ++found; - } - } - - // Throw error if this pattern is not found - if(found != 2){ - std::cerr << utility::timestamp("WARNING","VCF") << "Illegal contig entry line with no 
length defined!" << std::endl; - std::cerr << utility::timestamp("WARNING","VCF") << "Offending line: " << std::string(data, length+1) << std::endl; - contig.bp_length = std::numeric_limits::max(); - } - this->contigs.push_back(contig); - } - - // Also store the line as an object - // and as a literal string as observed - // in the VCF header - this->lines.push_back(line); // parseable lines - this->literal_lines.push_back(std::string(data, length + 1)); - return true; - } - - std::cerr << utility::timestamp("ERROR","VCF") << "Failed to parse VCF..." << std::endl; - return false; -} - -bool VCFHeader::buildContigTable(void){ - S32* retValue; - - delete this->contigsHashTable; - - if(this->contigs.size() < 1024) - this->contigsHashTable = new hash_table_type(1024); - else - this->contigsHashTable = new hash_table_type(this->contigs.size() * 2); - - if(!SILENT) - std::cerr << utility::timestamp("LOG", "VCF") << "Constructing lookup table for " << this->contigs.size() << " contigs..." << std::endl; - - for(U32 i = 0; i < this->contigs.size(); ++i){ - if(!(*this).getContig(this->contigs[i].name, retValue)){ - (*this).addContig(this->contigs[i].name, i); - } else { - std::cerr << utility::timestamp("ERROR", "VCF") << "Duplicated contig found (" << this->getContig(*retValue).name << "). Illegal..." << std::endl; - this->error_bit = VCF_ERROR_LINES; - return false; - } - } - return true; -} - -bool VCFHeader::parseFirstLine(reader_type& stream){ - if(!stream.good()){ - this->error_bit = STREAM_BAD; - return false; - } - - if(!stream.getLine()){ - std::cerr << utility::timestamp("ERROR", "VCF") << "Could not validate file..." << std::endl; - this->error_bit = STREAM_BAD; - return false; - } - - // Parse - if(strncmp(&stream[0], &vcf::constants::HEADER_VCF_FORMAT[0], vcf::constants::HEADER_VCF_FORMAT.size()) != 0){ - std::cerr << utility::timestamp("ERROR", "VCF") << "Invalid VCF format..." 
<< std::endl; - this->error_bit = VCF_ERROR_LINE1; - return false; - } - - if(strncmp(&stream[0], &vcf::constants::HEADER_VCF_VERSION[0], vcf::constants::HEADER_VCF_VERSION.size()) != 0){ - std::cerr << utility::timestamp("ERROR", "VCF") << "Invalid VCF version < 4.x..." << std::endl; - this->error_bit = VCF_ERROR_LINE1; - return false; - } - - this->literal_lines.push_back(std::string(&stream[0],stream.size()-1)); - stream.clear(); - return true; -} - -bool VCFHeader::parseFirstLine(const char* const data, U32& offset){ - offset = 4; - if(strncmp(&data[offset], &vcf::constants::HEADER_VCF_FORMAT[0], vcf::constants::HEADER_VCF_FORMAT.size()) != 0){ - std::cerr << utility::timestamp("ERROR", "BCF") << "Invalid VCF format..." << std::endl; - std::cerr << std::string(&data[offset], 100) << std::endl; - this->error_bit = VCF_ERROR_LINE1; - return false; - } - - if(strncmp(&data[offset], &vcf::constants::HEADER_VCF_VERSION[0], vcf::constants::HEADER_VCF_VERSION.size()) != 0){ - std::cerr << utility::timestamp("ERROR", "BCF") << "Invalid VCF version < 4.x..." 
<< std::endl; - this->error_bit = VCF_ERROR_LINE1; - return false; - } - - const char* hit = std::strchr(&data[offset], '\n'); - this->literal_lines.push_back(std::string(&data[offset], hit - &data[offset])); - offset += (hit + 1) - &data[offset]; - return true; -} - -bool VCFHeader::parseHeaderLines(reader_type& stream){ - while(stream.getLine()){ - if(stream.buffer_[1] != '#') - break; - - if(!this->checkLine(stream.buffer_, stream.size() - 2)){ - std::cerr << utility::timestamp("ERROR", "VCF") << "Failed to validate header lines" << std::endl; - this->error_bit = VCF_ERROR_LINES; - return false; - } - - stream.clear(); - } - - if(!stream.good()){ - this->error_bit = STREAM_BAD; - return false; - } - - return true; -} - -bool VCFHeader::parseHeaderLines(const char* const data, U32& offset){ - std::istringstream is(&data[offset]); - std::string line; - while(std::getline(is, line)){ - if(line[1] != '#') - break; - - if(!this->checkLine(&line[0], line.size() - 1)){ - std::cerr << utility::timestamp("ERROR", "BCF") << "Failed to validate header lines" << std::endl; - this->error_bit = VCF_ERROR_LINES; - return false; - } - } - - offset += (U32)is.tellg() - line.size() - 1; - - return true; -} - -bool VCFHeader::parseSampleLine(reader_type& stream){ - // At broken position is main header line - // Validate header - if(strncmp(&vcf::constants::HEADER_COLUMN[0], &stream.buffer_[0], vcf::constants::HEADER_COLUMN.size()) != 0){ - std::cerr << utility::timestamp("ERROR", "VCF") << "Could not validate header line" << std::endl; - this->error_bit = VCF_ERROR_SAMPLE; - return false; - } - - U32 search_position = vcf::constants::HEADER_COLUMN.size() + 1; - U64 delimiters_found = 0; - while(true){ // while there is samples in line - char* found = std::find(&stream[search_position], &stream[stream.size()-1], vcf::constants::VCF_DELIMITER); - if(*found != vcf::constants::VCF_DELIMITER) - break; - - //std::cerr << std::string(&stream[search_position], (found - stream.buffer_ + 1) 
- search_position) << std::endl; - search_position = found - stream.buffer_ + 1; - ++delimiters_found; - } - - this->buildSampleTable(delimiters_found); - - // Parse - search_position = vcf::constants::HEADER_COLUMN.size() + 1; - delimiters_found = 0; - S32* retValue; - char* found = 0; - while(found != &stream[stream.size()-1]){ // while there are samples in line - found = std::find(&stream[search_position], &stream[stream.size()-1], vcf::constants::VCF_DELIMITER); - std::string sampleName(&stream[search_position], (found - stream.buffer_ + 1) - search_position - 1); - - if(sampleName == "FORMAT"){ - search_position = found - stream.buffer_ + 1; - continue; - } - - if(!this->getSample(sampleName, retValue)) this->addSample(sampleName); - else { - std::cerr << utility::timestamp("ERROR", "VCF") << "Duplicated sample name in header..." << std::endl; - this->error_bit = VCF_ERROR_LINES; - } - - search_position = found - stream.buffer_ + 1; - } - - stream.clear(); - return true; -} - -bool VCFHeader::parseSampleLine(const char* const data, U32& offset, const U32& length){ - // At broken position is main header line - // Validate header - if(strncmp(&vcf::constants::HEADER_COLUMN[0], &data[offset], vcf::constants::HEADER_COLUMN.size()) != 0){ - std::cerr << utility::timestamp("ERROR", "VCF") << "Could not validate header line" << std::endl; - this->error_bit = VCF_ERROR_SAMPLE; - return false; - } - - if(offset+vcf::constants::HEADER_COLUMN.size()+2 == length){ - //std::cerr << "no samples" << std::endl; - this->buildSampleTable(0); - return true; - } - - offset += vcf::constants::HEADER_COLUMN.size() + 1; - U64 delimiters_found = 0; - U32 offset_original = offset; - - while(true){ // while there is samples in line - const char* const found = std::strchr(&data[offset], vcf::constants::VCF_DELIMITER); - //std::cerr << (void*)found << '\t' << (void*)&data[length] << std::endl; - if(found == 0 || (*found != vcf::constants::VCF_DELIMITER)){ - std::string 
sampleName(&data[offset], (&data[length - 1] - &data[offset]) - 1); // -2 because offset is +1 and newline is +1 - //std::cerr << sampleName << std::endl; - ++delimiters_found; - break; - } - - std::string sampleName(&data[offset], (found - &data[offset])); - if(sampleName == "FORMAT"){ - offset += found - &data[offset] + 1; - continue; - } - - offset += found - &data[offset] + 1; - ++delimiters_found; - } - - this->buildSampleTable(delimiters_found); - - offset = offset_original; - S32* retValue; - while(true){ // while there is samples in line - const char* const found = std::strchr(&data[offset], vcf::constants::VCF_DELIMITER); - if(found == 0 || (*found != vcf::constants::VCF_DELIMITER)){ - std::string sampleName(&data[offset], (&data[length - 1] - &data[offset]) - 1); // -2 because offset is +1 and newline is +1 - if(!this->getSample(sampleName, retValue)) - this->addSample(sampleName); - else { - std::cerr << utility::timestamp("ERROR", "VCF") << "Duplicated sample name in header..." << std::endl; - this->error_bit = VCF_ERROR_LINES; - } - break; - } - - - std::string sampleName(&data[offset], (found - &data[offset])); - if(sampleName == "FORMAT"){ - offset += found - &data[offset] + 1; - continue; - } - - if(!this->getSample(sampleName, retValue)) - this->addSample(sampleName); - else { - std::cerr << utility::timestamp("ERROR", "VCF") << "Duplicated sample name in header..." 
<< std::endl; - this->error_bit = VCF_ERROR_LINES; - } - offset += found - &data[offset] + 1; - } - - return true; -} - - -} -} /* namespace Tachyon */ diff --git a/lib/io/vcf/VCFHeader.h b/lib/io/vcf/VCFHeader.h deleted file mode 100644 index 1a0ac80..0000000 --- a/lib/io/vcf/VCFHeader.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef VCF_VCFHEADER_H_ -#define VCF_VCFHEADER_H_ - -#include - -#include "algorithm/OpenHashTable.h" -#include "core/header/header_contig.h" -#include "core/header/header_map_entry.h" -#include "core/header/header_sample.h" -#include "io/BasicReader.h" -#include "VCFHeaderConstants.h" -#include "VCFHeaderLine.h" -#include "support/helpers.h" -#include "io/basic_buffer.h" - -namespace tachyon { -namespace vcf{ - -class VCFHeader { - typedef VCFHeader self_type; - typedef hash::HashTable hash_table_type; - typedef core::HeaderContig contig_type; - typedef io::BasicBuffer buffer_type; - typedef VCFHeaderLine header_line_type; - typedef core::HeaderMapEntry map_entry_type; - typedef core::HeaderSample header_sample_type; - typedef io::BasicReader reader_type; - - enum VCF_ERROR_TYPE {VCF_PASS, VCF_ERROR_LINE1, VCF_ERROR_LINES, VCF_ERROR_SAMPLE, STREAM_BAD}; - -public: - VCFHeader(); - ~VCFHeader(); - - inline bool good(void) const{ return(this->error_bit == VCF_PASS); } - inline bool valid(void) const{ return(this->version > 0); } - inline void setVersion(float version){ this->version = version; } - inline const float& getVersion(void) const{ return(this->version); } - inline U32 getContigs(void) const{ return this->contigs.size(); } - //inline const contig_type& operator[](const U32 p) const{ return(this->contigs[p]); } - inline contig_type& getContig(const U32 p){ return this->contigs[p]; } - inline U32 getLines(void) const{ return this->lines.size(); } - inline const U64& size(void) const{ return this->samples; } - - inline bool getContig(const std::string& contig, S32*& retValue) const{ - return(this->contigsHashTable->GetItem(&contig[0], 
&contig, retValue, contig.size())); - } - - inline bool getSample(const std::string& sample, S32*& retValue) const{ - return(this->sampleHashTable->GetItem(&sample[0], &sample, retValue, sample.size())); - } - - bool parse(reader_type& stream); - bool parse(const char* const data, const U32& length); - -private: - // These functions are unsafe as they require contigHashTable to be - // set prior to calling - // no tests are made to check - inline void addContig(const std::string& contig, U32 value){ - this->contigsHashTable->SetItem(&contig[0], &contig, value, contig.size()); - } - - inline void addSample(const std::string& sample){ - this->sampleNames.push_back(header_sample_type(sample)); - this->sampleHashTable->SetItem(&sample[0], &sample, this->sampleNames.size()-1, sample.size()); - } - - // Internal overload helpers - bool checkLine(const char* data, const U32 length); - bool buildContigTable(void); - void buildSampleTable(U64 samples); - bool parseFirstLine(reader_type& stream); - bool parseHeaderLines(reader_type& stream); - bool parseSampleLine(reader_type& stream); - bool parseFirstLine(const char* const data, U32& offset); - bool parseHeaderLines(const char* const data, U32& offset); - bool parseSampleLine(const char* const data, U32& offset, const U32& length); - -public: - VCF_ERROR_TYPE error_bit; // parse error bit - U64 samples; // number of samples - float version; // VCF version - std::string literal; // string copy of header data - // Contigs are written to disk as an - // object - std::vector contigs; // contigs - // Sample names are written to disk as - // U32 l_name; char[l_name] - std::vector sampleNames; // sample names - std::vector lines; // header lines - std::vector literal_lines; // vcf line literals - // These entries are read from disk as - // U32 l_name; BYTE type; char[l_name] - //std::vector map; - std::vector info_map; - std::vector format_map; - std::vector filter_map; - // Constructed during run-time - S32* info_remap; // map 
from IDX to local id for O(1) lookup - S32* format_remap; - S32* filter_remap; - - hash_table_type* contigsHashTable; // hash table for contig names - hash_table_type* sampleHashTable; // hash table for sample names -}; - -} -} /* namespace Tomahawk */ - -#endif /* VCFHEADER_H_ */ diff --git a/lib/io/vcf/VCFHeaderConstants.h b/lib/io/vcf/VCFHeaderConstants.h deleted file mode 100644 index 0e4b946..0000000 --- a/lib/io/vcf/VCFHeaderConstants.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef VCF_VCFHEADERCONSTANTS_H_ -#define VCF_VCFHEADERCONSTANTS_H_ - -#include -#include "support/type_definitions.h" - -namespace tachyon{ -namespace vcf{ -namespace constants{ - -const std::string HEADER_COLUMN = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; -const char VCF_DELIMITER = '\t'; - -const std::string HEADER_VCF_FORMAT = "##fileformat=VCFv"; -const std::string HEADER_VCF_VERSION = "##fileformat=VCFv4"; -const std::string HEADER_FILEFORMAT = "##fileformat="; -const std::string HEADER_CONTIG = "##contig="; -const std::string HEADER_ALT = "##alt="; -const std::string HEADER_INFO = "##info="; -const std::string HEADER_FILTER = "##filter="; -const std::string HEADER_FORMAT = "##format="; - -} -} -} - - -#endif /* VCFHEADERCONSTANTS_H_ */ diff --git a/lib/io/vcf/VCFHeaderLine.h b/lib/io/vcf/VCFHeaderLine.h deleted file mode 100644 index 49249dc..0000000 --- a/lib/io/vcf/VCFHeaderLine.h +++ /dev/null @@ -1,169 +0,0 @@ -#ifndef VCF_VCFHEADERLINE_H_ -#define VCF_VCFHEADERLINE_H_ - -#include -#include -#include // for std::find -#include -#include - -#include "VCFHeaderConstants.h" -#include "support/helpers.h" - -namespace tachyon{ -namespace vcf{ - -enum TACHYON_VCF_HEADER_LINE_TYPE{ - YON_VCF_HEADER_UNKNOWN, - YON_VCF_HEADER_FORMAT, - YON_VCF_HEADER_FILTER, - YON_VCF_HEADER_INFO, - YON_VCF_HEADER_CONTIG -}; - -struct VCFHeaderLine{ -private: - typedef VCFHeaderLine self_type; - - // Internal helper struct - struct VCFHeaderLineKeyValue{ - typedef VCFHeaderLineKeyValue self_type; - - 
VCFHeaderLineKeyValue(const std::string& key, const std::string& value) : KEY(key), VALUE(value){} - VCFHeaderLineKeyValue(){} - ~VCFHeaderLineKeyValue(){} - - friend std::ostream& operator<<(std::ostream& out, const self_type& pair){ - out << pair.KEY << '\t' << pair.VALUE; - return(out); - } - - std::string KEY; - std::string VALUE; - }; - typedef VCFHeaderLineKeyValue key_value; - -public: - VCFHeaderLine(const char* data, const U32 size) : type(YON_VCF_HEADER_UNKNOWN), isIndexable(false), size_(size), data(data){} - ~VCFHeaderLine(){} - - inline const U32 size(void) const{ return this->pairs.size(); } - inline const key_value& operator[](const U32 p) const{ return this->pairs[p]; } - inline bool isValid(void) const{ return(this->size_ > 2 && (this->data[0] == '#' && this->data[1] == '#')); } - inline bool isCONTIG(void) const{ - return(strncasecmp(&constants::HEADER_CONTIG[0], &this->data[0], constants::HEADER_CONTIG.size()) == 0); - } - - friend std::ostream& operator<<(std::ostream& out, const self_type& pair){ - out << pair.data << '\n'; - for(U32 i = 0; i < pair.pairs.size(); ++i) - out << i << '/' << pair.size() << '\t' << pair[i] << '\n'; - - return(out); - } - - bool Parse(void){ - // Make sure this is a valid VCF header line - // Rule: has to start with ## - if(!this->isValid()){ - std::cerr << utility::timestamp("ERROR", "VCF") << "Invalid VCF header line..." << std::endl; - return false; - } - - // Attempt to find an equal sign - const char* match = std::find(this->data, &this->data[this->size_], '='); - if(*match != '='){ - std::cerr << utility::timestamp("ERROR", "VCF") << "Corrupted VCF header entry: no equal match..." 
<< std::endl; - return false; - } - - if(this->data[match - this->data + 1] != '<'){ - this->isIndexable = false; - return true; - } - - //std::string(this->data + 2, match - this->data - 2); - if(strncasecmp(this->data + 2, "FORMAT", match - this->data - 2) == 0){ - this->type = YON_VCF_HEADER_FORMAT; - this->isIndexable = true; - } else if(strncasecmp(this->data + 2, "FILTER", match - this->data - 2) == 0){ - this->type = YON_VCF_HEADER_FILTER; - this->isIndexable = true; - } else if(strncasecmp(this->data + 2, "INFO", match - this->data - 2) == 0){ - this->type = YON_VCF_HEADER_INFO; - this->isIndexable = true; - } - - U32 matchPos = match - this->data + 1; - if(this->data[matchPos] == '<'){ - if(this->data[this->size_] != '>'){ - std::cerr << utility::timestamp("ERROR", "VCF") << "Corrupted VCF header entry: " << this->data[this->size_] << std::endl; - return false; - } - - ++matchPos; - - // Sweep over and assert it is valid - while(this->nextKey(matchPos)){ - // nothing in body - } - - } - - return true; - } - -private: - bool nextKey(U32& startPos){ - if(this->data[startPos] == '>') - return false; - - const char* match = std::find(&this->data[startPos], &this->data[this->size_], '='); - if(*match != '='){ - std::cerr << utility::timestamp("ERROR", "VCF") << "Corrupted VCF header entry: no equal match in next key..." 
<< std::endl; - return false; - } - U32 matchPos = match - this->data; - key_value entry; - entry.KEY = std::string(&this->data[startPos], matchPos - startPos); - - startPos = matchPos + 1; - - char match_token = ','; - BYTE adjust_value = 0; - if(this->data[startPos] == '"'){ - match_token = '"'; - adjust_value = 1; - } - - match = std::find(&this->data[startPos + adjust_value], &this->data[this->size_], match_token); - if(*match == '>'){ - entry.VALUE = std::string(&this->data[startPos],this->size_ - startPos); - startPos = matchPos + 1; - this->pairs.push_back(entry); - return false; - } else if(*match != match_token){ - std::cerr << utility::timestamp("ERROR", "VCF") << "Corrupted VCF header entry: no comma match in next key..." << std::endl; - return false; - } - - matchPos = match - this->data; - entry.VALUE = std::string(&this->data[startPos], matchPos - startPos + adjust_value); - startPos = matchPos + 1; - if(this->data[startPos] == ',') ++startPos; - this->pairs.push_back(entry); - return true; - } - -public: - TACHYON_VCF_HEADER_LINE_TYPE type; - bool isIndexable; // if this record should form part of the primary map - U32 size_; // size in bytes - const char* data; // pointer to data - std::vector pairs; // key-value pairs -}; - -} -} - -#endif /* VCF_VCFHEADERLINE_H_ */ diff --git a/lib/io/vcf/VCFLines.h b/lib/io/vcf/VCFLines.h deleted file mode 100644 index d459f02..0000000 --- a/lib/io/vcf/VCFLines.h +++ /dev/null @@ -1,300 +0,0 @@ -#ifndef VCFLINES_H_ -#define VCFLINES_H_ - -#include -#include -#include -#include -#include -#include - -#include "io/vcf/VCFHeaderConstants.h" -#include "support/helpers.h" -#include "support/MagicConstants.h" - -namespace tachyon{ -namespace vcf{ - -// -#pragma pack(push, 1) -struct __attribute__((packed, aligned(1))) VCFDiploidGenotype{ -public: - VCFDiploidGenotype(); // Has no ctor or dtor - ~VCFDiploidGenotype(); - - bool hasMissing(void) const{return(this->snpA == '.' 
|| this->snpB == ','); } - - char snpA; - char separator; - char snpB; - char spacer; - - friend std::ostream& operator<<(std::ostream& stream, const VCFDiploidGenotype& entry){ - stream << entry.snpA << entry.separator << entry.snpB; - return stream; - } -}; -#pragma pack(pop) - -class VCFLineDataInterface{ -public: - VCFLineDataInterface(const U32 samples) : samples_(samples), dataLength_(0), inputData_(nullptr){} - virtual ~VCFLineDataInterface(){} - virtual bool Parse(void) =0; - void SetData(const char* data, const U32 dataLength){ this->inputData_ = data; this->dataLength_ = dataLength; } - //virtual const VCFDiploidGenotype& operator[](const U32 position) const =0; - const U32& size(void) const{ return this->samples_; } - -protected: - const U32 samples_; - U32 dataLength_; - const char* inputData_; -}; - -class VCFLineDataSimple : public VCFLineDataInterface{ -public: - VCFLineDataSimple(const U32 samples) : VCFLineDataInterface(samples), data_(nullptr){} - ~VCFLineDataSimple(){} - - const VCFDiploidGenotype& operator[](const U32 position) const{ return this->data_[position]; } - - bool Parse(void){ // From interface - this->data_ = reinterpret_cast(this->inputData_); - return true; - } - -public: - const VCFDiploidGenotype* data_; -}; - -class VCFLineDataComplex : public VCFLineDataInterface { -public: - VCFLineDataComplex(const U32 samples) : VCFLineDataInterface(samples), data_(samples){} - ~VCFLineDataComplex(){} - - const VCFDiploidGenotype* operator[](const U32 position) const{ return this->data_[position]; } - bool Parse(void){ - this->data_.clear(); - uint32_t search_position = 0; - uint32_t delimiters_found = 0; - while(true){ // while there is samples in line - const char* found = std::find(&this->inputData_[search_position], &this->inputData_[this->dataLength_], '\t'); - - if(*found != vcf::constants::VCF_DELIMITER){ - //std::cerr << "break no match " << (*found == '\n') << '\t' << (int)*found << '\t' << *found << '\t' << found - this->inputData_ 
<< '/' << this->dataLength_ << std::endl; - break; - } - - //std::cerr << std::string(&this->inputData_[search_position], (found - this->inputData_ + 1) - search_position) << std::endl; - this->data_.push_back(reinterpret_cast(&this->inputData_[search_position])); - //std::cerr << this->data_[this->data_.size()-1]->snpA << '/' << this->data_[this->data_.size()-1]->snpB << std::endl; - search_position = found - this->inputData_ + 1; - ++delimiters_found; - } - - //std::cerr << std::string(&this->inputData_[search_position], this->dataLength_ - search_position + 1) << std::endl; - //std::cerr << "final: " << this->dataLength_ - search_position + 1 << '\t' << this->dataLength_ << '\t' << search_position << std::endl; - this->data_.push_back(reinterpret_cast(&this->inputData_[search_position])); - ++delimiters_found; - - if(delimiters_found == this->samples_) - return true; - - std::cerr << utility::timestamp("ERROR", "VCF") << "Found " << delimiters_found << " samples in line but expected " << this->samples_ << "..." 
<< std::endl; - exit(1); - return false; - - } // From interface - -public: - std::vector data_; -}; - -class VCFLine{ -public: - VCFLine(const U32 samples): simple_(samples), complex_(samples){} - ~VCFLine(void){} - - inline const bool checkSeparator(const char& separator) const{ - if(separator == '/') - return true; - else if(separator == '|') - return true; - else return false; - } - - - bool Parse(const char* source, const U32 sourceLength){ - BYTE found = 0; - U32 sourceLastPosition = 0; - U32 sourceFoundPosition = 0; - - while(true){ - const char* match = std::find(&source[sourceLastPosition], &source[sourceLength], vcf::constants::VCF_DELIMITER); - if(*match != vcf::constants::VCF_DELIMITER){ - std::cerr << utility::timestamp("ERROR", "VCF") << "Illegal VCF line" << std::endl; - return false; - } - - sourceFoundPosition = match - source; - - switch(found){ - case 0: - this->CHROM = &source[0]; - this->lCHROM = sourceFoundPosition - sourceLastPosition; - break; - case 1: - this->POS = &source[sourceLastPosition]; - this->lPOS = sourceFoundPosition - sourceLastPosition; - this->position = atoi(this->POS); - break; - case 2: - this->ID = &source[sourceLastPosition]; - this->lID = sourceFoundPosition - sourceLastPosition; - break; - case 3: - this->REF = &source[sourceLastPosition]; - this->lREF = sourceFoundPosition - sourceLastPosition; - break; - case 4: - this->ALT = &source[sourceLastPosition]; - this->lALT = sourceFoundPosition - sourceLastPosition; - break; - case 5: - this->QUAL = &source[sourceLastPosition]; - this->lQUAL = sourceFoundPosition - sourceLastPosition; - break; - case 6: - this->FILTER = &source[sourceLastPosition]; - this->lFILTER = sourceFoundPosition - sourceLastPosition; - break; - case 7: - this->INFO = &source[sourceLastPosition]; - this->lINFO = sourceFoundPosition - sourceLastPosition; - break; - case 8: - this->FORMAT = &source[sourceLastPosition]; - this->lFORMAT = sourceFoundPosition - sourceLastPosition; - break; - } - - 
sourceLastPosition = sourceFoundPosition + 1; - ++found; - - if(found == 9){ - this->isComplex(); - this->SetReference(); - if(!this->getComplex() && this->IsSimple()){ - this->simple_.SetData(&source[sourceLastPosition], sourceLength - sourceLastPosition - 1); - this->simple_.Parse(); - } else if(this->getComplex() && this->IsSimple()){ - this->complex_.SetData(&source[sourceLastPosition], sourceLength - sourceLastPosition - 1); - this->complex_.Parse(); - } - return true; - } - } - - return false; - } - - // Complex: defined as FORMAT field equals "GT" && site is biallelic - const bool& getComplex(void) const{ return this->Complex; } - - template - const float getMissingness(const T& samples) const{ - U64 total = 0; - if(this->getComplex()){ - for(U32 i = 0; i < samples; ++i){ - if(!this->checkSeparator(this->complex_.data_[i]->separator)) - return(2); - - if(this->complex_.data_[i]->snpA == '.' || this->complex_.data_[i]->snpB == '.') - ++total; - } - } else { - for(U32 i = 0; i < samples; ++i){ - if(!this->checkSeparator(this->simple_.data_[i].separator)) - return(2); - - if(this->simple_[i].snpA == '.' || this->simple_.data_[i].snpB == '.') - ++total; - } - } - return((float)total/samples); - } - - bool isComplex(void){ - std::cerr << "Not used" << std::endl; - exit(1); - /* - if(strncmp(this->FORMAT, &vcf::constants::GT_ONLY[0], vcf::constants::GT_ONLY.size()) == 0 && this->lFORMAT == 2) - this->Complex = false; - else { - if(strncmp(this->FORMAT, &vcf::constants::GT_ONLY[0], vcf::constants::GT_ONLY.size()) == 0) - this->Complex = true; - else { - std::cerr << utility::timestamp("ERROR", "VCF") << "Could not parse GT information..." 
<< std::endl; - exit(1); - } - } - */ - - return this->Complex; - } - const bool IsSimple(void) const{ return(this->lALT == 1 && this->lREF == 1); } - - void SetReference(void){ - this->ref_alt = 0; - - switch(this->REF[0]){ - case 'A': this->ref_alt ^= tachyon::constants::REF_ALT_A << 4; break; - case 'T': this->ref_alt ^= tachyon::constants::REF_ALT_T << 4; break; - case 'G': this->ref_alt ^= tachyon::constants::REF_ALT_G << 4; break; - case 'C': this->ref_alt ^= tachyon::constants::REF_ALT_C << 4; break; - case '.': this->ref_alt ^= tachyon::constants::REF_ALT_N << 4; break; - } - - switch(this->ALT[0]){ - case 'A': this->ref_alt ^= tachyon::constants::REF_ALT_A << 0; break; - case 'T': this->ref_alt ^= tachyon::constants::REF_ALT_T << 0; break; - case 'G': this->ref_alt ^= tachyon::constants::REF_ALT_G << 0; break; - case 'C': this->ref_alt ^= tachyon::constants::REF_ALT_C << 0; break; - case '.': this->ref_alt ^= tachyon::constants::REF_ALT_N << 0; break; - } - } - - const BYTE& getReference(void) const{ return this->ref_alt; } - -public: - U16 lCHROM; - U16 lPOS; - U16 lID; - U16 lREF; - U16 lALT; - U16 lQUAL; - U16 lFILTER; - U16 lINFO; - U16 lFORMAT; - U32 position; - bool Complex; - const char* CHROM; - const char* POS; - const char* ID; - const char* REF; - const char* ALT; - const char* QUAL; - const char* FILTER; - const char* INFO; - const char* FORMAT; - BYTE ref_alt; - VCFLineDataSimple simple_; - VCFLineDataComplex complex_; -}; - -} -} - - - -#endif /* VCFLINES_H_ */ diff --git a/lib/io/vcf_utils.cpp b/lib/io/vcf_utils.cpp new file mode 100644 index 0000000..5484fd7 --- /dev/null +++ b/lib/io/vcf_utils.cpp @@ -0,0 +1,18 @@ +#include "vcf_utils.h" + +namespace tachyon { +namespace io { + +const bcf_hrec_t* GetPopulatedHrec(const bcf_idpair_t& idPair) { + for (int i = 0; i < 3; i++) { + const bcf_hrec_t* hrec = idPair.val->hrec[i]; + if (hrec != nullptr) { + return hrec; + } + } + std::cerr << "No populated hrec in idPair. Error in htslib." 
<< std::endl; + return nullptr; +} + +} +} diff --git a/lib/io/vcf_utils.h b/lib/io/vcf_utils.h new file mode 100644 index 0000000..22cf73f --- /dev/null +++ b/lib/io/vcf_utils.h @@ -0,0 +1,1116 @@ +#ifndef IO_HTSLIB_INTEGRATION_H_ +#define IO_HTSLIB_INTEGRATION_H_ + +#include +#include +#include +#include +#include + +#include "htslib/kstring.h" +#include "htslib/vcf.h" +#include "htslib/hts.h" + +#include "support/helpers.h" +#include "io/basic_buffer.h" + +namespace tachyon { +namespace io { + +const std::vector BCF_TYPE_LOOKUP = {"NULL","INT8","INT16","INT32", + "ERROR","FLOAT","ERROR","CHAR"}; + +struct VcfContig { +public: + VcfContig() : idx(0), n_bases(0){} + ~VcfContig() = default; + + std::string ToVcfString(const bool is_bcf = false) const{ + // Template: + // ##contig= + std::string ret = "##contig=name; + if(extra.size()){ + ret += "," + this->extra[0].first + "=" + this->extra[0].second; + for(U32 i = 1; i < this->extra.size(); ++i){ + ret += "," + this->extra[i].first + "=" + this->extra[i].second; + } + } + if(this->description.size()) ret += ",Description=" + this->description; + ret += ",length=" + std::to_string(this->n_bases); + if(is_bcf) ret += ",IDX=" + std::to_string(this->idx); + ret += ">"; + return(ret); + } + +public: + // Required. The internal identifier for this field + uint32_t idx; + + // Required. The name of the contig. Canonically this is the first + // non-whitespace-containing string after the > marker in a FASTA file. + // For example, the line: + // >chr1 more info here + // has a name of "chr1" and a description of "more info here" + std::string name; + + // Ideally this record is filled in as described above, but not all FASTA + // readers capture the description information after the name. Since a + // description is not required by the FASTA spec, we cannot distinguish cases + // where a description was not present and where a parser ignored it. + std::string description; + + // The length of this contig in basepairs. 
+ int64_t n_bases; + + // Additional information used when reading and writing VCF headers. An + // example map of key-value extra fields would transform an input line + // containing 'assembly=B36,taxonomy=x,species="Homo sapiens"' to a map with + // "assembly" -> "B36", "taxonomy" -> "x", "species" -> "Homo sapiens". We + // never use this information internally, other than reading it in so we can + // write the contig out again. + std::vector< std::pair > extra; +}; + +// Temp declare +struct VcfInfo{ +public: + VcfInfo() : idx(0){} + ~VcfInfo() = default; + + std::string ToVcfString(const bool is_bcf = false) const{ + // Template: + // ##INFO= + std::string ret = "##INFO=id; + ret += ",Number=" + this->number; + ret += ",Type=" + this->type; + ret += ",Description=" + this->description; + if(this->source.size()) ret += ",Source=" + this->source; + if(this->source.size()) ret += ",Version=" + this->version; + if(is_bcf) ret += ",IDX=" + std::to_string(this->idx); + ret += ">"; + return(ret); + } + + std::string ToVcfString(const uint32_t idx) const{ + // Template: + // ##INFO= + std::string ret = "##INFO=id; + ret += ",Number=" + this->number; + ret += ",Type=" + this->type; + ret += ",Description=" + this->description; + if(this->source.size()) ret += ",Source=" + this->source; + if(this->source.size()) ret += ",Version=" + this->version; + ret += ",IDX=" + std::to_string(idx); + ret += ">"; + return(ret); + } + +public: + // Required. The internal identifier for this field + uint32_t idx; + + // Required. The unique ID of the INFO field. Examples include "MQ0" or "END". + std::string id; + + // Required. The number of values included with the info field. This should be + // the string representation of the number, e.g. "1" for a single entry, "2" + // for a pair of entries, etc. Special cases arise when the number of entries + // depend on attributes of the Variant or are unknown in advance, and include: + // "A": The field has one value per alternate allele. 
+ // "R": The field has one value per allele (including the reference). + // "G": The field has one value for each possible genotype. + // ".": The number of values varies, is unknown, or is unbounded. + std::string number; + + // Required. The type of the INFO field. Valid values are "Integer", "Float", + // "Flag", "Character", and "String". + std::string type; + + // Required by VCF. The description of the field. + std::string description; + + // Optional. The annotation source used to generate the field. + std::string source; + + // Optional. The version of the annotation source used to generate the field. + std::string version; +}; + +struct VcfFormat{ +public: + VcfFormat() : idx(0){} + ~VcfFormat() = default; + + std::string ToVcfString(const bool is_bcf = false) const{ + // Template: + // ##FORMAT= + std::string ret = "##FORMAT=id; + ret += ",Number=" + this->number; + ret += ",Type=" + this->type; + ret += ",Description=" + this->description; + if(is_bcf) ret += ",IDX=" + std::to_string(this->idx); + ret += ">"; + return(ret); + } + + std::string ToVcfString(const uint32_t idx) const{ + // Template: + // ##FORMAT= + std::string ret = "##FORMAT=id; + ret += ",Number=" + this->number; + ret += ",Type=" + this->type; + ret += ",Description=" + this->description; + ret += ",IDX=" + std::to_string(idx); + ret += ">"; + return(ret); + } + +public: + // Required. The unique ID of the FORMAT field. Examples include "GT", "PL". + std::string id; + + // Required. The internal identifier for this field + uint32_t idx; + + // Required. The number of entries expected. See description above in the + // VcfInfo message. + std::string number; + + // Required. The type of the field. Valid values are "Integer", "Float", + // "Character", and "String" (same as INFO except "Flag" is not supported). + std::string type; + + // Required by VCF. The description of the field. 
+ std::string description; +}; + +struct VcfFilter{ +public: + VcfFilter() : idx(0){} + ~VcfFilter() = default; + + std::string ToVcfString(const bool is_bcf = false) const{ + // Template: + // ##FORMAT= + std::string ret = "##FILTER=id; + ret += ",Description=" + this->description; + if(is_bcf) ret += ",IDX=" + std::to_string(this->idx); + ret += ">"; + return(ret); + } + + std::string ToVcfString(const uint32_t idx) const{ + // Template: + // ##FORMAT= + std::string ret = "##FILTER=id; + ret += ",Description=" + this->description; + ret += ",IDX=" + std::to_string(idx); + ret += ">"; + return(ret); + } + + friend std::ostream& operator<<(std::ostream& stream, const VcfFilter& flt){ + stream.write((const char*)&flt.idx, sizeof(uint32_t)); + + utility::SerializeString(flt.id, stream); + utility::SerializeString(flt.description, stream); + + return(stream); + } + + friend std::istream& operator>>(std::istream& stream, VcfFilter& flt){ + stream.read((char*)&flt.idx, sizeof(uint32_t)); + + utility::DeserializeString(flt.id, stream); + utility::DeserializeString(flt.description, stream); + + return(stream); + } + + friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const VcfFilter& flt){ + io::SerializePrimitive(flt.idx, buffer); + io::SerializeString(flt.id, buffer); + io::SerializeString(flt.description, buffer); + return(buffer); + } + + friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, VcfFilter& flt){ + io::DeserializePrimitive(flt.idx, buffer); + io::DeserializeString(flt.id, buffer); + io::DeserializeString(flt.description, buffer); + return(buffer); + } + +public: + // Required. The internal identifier for this field + uint32_t idx; + + // Required. The unique ID of the filter. Examples include "PASS", "RefCall". + std::string id; + + // Required by VCF. The description of the filter. + std::string description; +}; + +// This record type is a catch-all for other types of headers. 
For example, +// ##pedigreeDB=http://url_of_pedigrees +// The VcfExtra message would represent this with key="pedigreeDB", +// value="http://url_of_pedigrees". +struct VcfExtra{ +public: + VcfExtra() = default; + VcfExtra(const std::string& key, const std::string& value) : + key(key), + value(value) + {} + + ~VcfExtra() = default; + + std::string ToVcfString(void) const{ + // Template: + // ##source=CombineGVCFs + std::string ret = "##" + this->key + "=" + this->value; + return(ret); + } + + friend std::ostream& operator<<(std::ostream& stream, const VcfExtra& extra){ + utility::SerializeString(extra.key, stream); + utility::SerializeString(extra.value, stream); + return(stream); + } + + friend std::istream& operator>>(std::istream& stream, VcfExtra& extra){ + utility::DeserializeString(extra.key, stream); + utility::DeserializeString(extra.value, stream); + return(stream); + } + + friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const VcfExtra& extra){ + io::SerializeString(extra.key, buffer); + io::SerializeString(extra.value, buffer); + return(buffer); + } + + friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, VcfExtra& extra){ + io::DeserializeString(extra.key, buffer); + io::DeserializeString(extra.value, buffer); + return(buffer); + } + +public: + // Required by VCF. The key of the extra header field. Note that this key does + // not have to be unique within a VcfHeader. + std::string key; + + // Required by VCF. The value of the extra header field. + std::string value; +}; + +// This record type is a catch-all for other headers containing multiple +// key-value pairs. For example, headers may have META lines that provide +// metadata about the VCF as a whole, e.g. +// ##META= +// The VcfStructuredExtra message would represent this with key="META", +// and fields mapping "ID" -> "Assay", "Type" -> "String", etc. 
+struct VcfStructuredExtra{ +public: + VcfStructuredExtra() = default; + ~VcfStructuredExtra() = default; + + std::string ToVcfString(void) const{ + // Template: + // ##META= + std::string ret = "##" + this->key + "=<"; + ret += this->fields[0].key + "=" + this->fields[0].value; + for(U32 i = 1; i < this->fields.size(); ++i) + ret += "," + this->fields[i].key + "=" + this->fields[i].value; + ret += ">"; + return(ret); + } + + friend std::ostream& operator<<(std::ostream& stream, const VcfStructuredExtra& extra){ + utility::SerializeString(extra.key, stream); + size_t l_extra = extra.fields.size(); + stream.write((const char*)&l_extra, sizeof(size_t)); + for(U32 i = 0; i < extra.fields.size(); ++i) + stream << extra.fields[i]; + + return(stream); + } + + friend std::istream& operator>>(std::istream& stream, VcfStructuredExtra& extra){ + utility::DeserializeString(extra.key, stream); + size_t l_extra; + stream.read((char*)&l_extra, sizeof(size_t)); + extra.fields.resize(l_extra); + for(U32 i = 0; i < extra.fields.size(); ++i) + stream >> extra.fields[i]; + + return(stream); + } + + friend io::BasicBuffer& operator<<(io::BasicBuffer& buffer, const VcfStructuredExtra& extra){ + io::SerializeString(extra.key, buffer); + size_t l_extra = extra.fields.size(); + io::SerializePrimitive(l_extra, buffer); + for(U32 i = 0; i < extra.fields.size(); ++i) + buffer << extra.fields[i]; + + return(buffer); + } + + friend io::BasicBuffer& operator>>(io::BasicBuffer& buffer, VcfStructuredExtra& extra){ + io::DeserializeString(extra.key, buffer); + size_t l_extra; + io::DeserializePrimitive(l_extra, buffer); + extra.fields.resize(l_extra); + for(U32 i = 0; i < extra.fields.size(); ++i) + buffer >> extra.fields[i]; + + return(buffer); + } + +public: + // Required by VCF. The key of the extra header field. Note that this key does + // not have to be unique within a VcfHeader. + std::string key; + + // Required by VCF. The key=value pairs contained in the structure. 
+ std::vector fields; +}; + +// +// ----------------------------------------------------------------------------- +// VCF type encoding utilities +template +struct VcfType { + // Predicates for checking missing and sentinel entries. Use these, not ==. + // Is argument the "missing" value? + static bool IsMissing(T); + // Is argument the vector end sentinel value? + static bool IsVectorEnd(T); +}; + +// See interface description comment above. +template<> +struct VcfType { + static bool IsMissing(int8_t v) { return (v == bcf_int8_missing); } + static bool IsVectorEnd(int8_t v) { return (v == bcf_int8_vector_end); } +}; + +// See interface description comment above. +template<> +struct VcfType { + static bool IsMissing(int16_t v) { return (v == bcf_int16_missing); } + static bool IsVectorEnd(int16_t v) { return (v == bcf_int16_vector_end); } +}; + +// See interface description comment above. +template<> +struct VcfType { + static bool IsMissing(int v) { return (v == bcf_int32_missing); } + static bool IsVectorEnd(int v) { return (v == bcf_int32_vector_end); } +}; + +// See interface description comment above. +template<> +struct VcfType { + static bool IsMissing(float v) { return bcf_float_is_missing(v); } + static bool IsVectorEnd(float v) { return bcf_float_is_vector_end(v); } +}; + + +template +struct VcfGenotype { + // Predicates for checking missing and sentinel entries. + static bool IsMissing(const T& value){ return(value == bcf_gt_missing); } +}; + +// Genotype helper +struct VcfGenotypeSummary{ +public: + VcfGenotypeSummary(void) : + base_ploidy(0), + phase_if_uniform(0), + mixed_phasing(false), + invariant(false), + n_missing(0), + n_vector_end(0) + {} + + ~VcfGenotypeSummary() = default; + + /**< + * Gathers summary statistics for a vector of genotypes + * at a given site. Collects information regarding the + * number of missing genotypes and count of sentinel + * nodes, checks if the phasing is uniform and whether + * all the genotypes are identical. 
+ * Todo: Use hashes to check for uniformity of genotypes. + * @param n_samples Total number of samples in the input vector. + * This is equivalent to the samples in the file. + * @param fmt The target htslib format structure. + * @return Returns TRUE upon success or FALSE otherwise. + */ + template bool evaluate(const size_t& n_samples, const bcf_fmt_t& fmt){ + if(fmt.p_len == 0) return true; + assert(fmt.size/fmt.n == sizeof(T)); + + // Set the base ploidy. This corresponds to the LARGEST + // ploidy observed of ANY individual sample at the given + // locus. If a genotype has a ploidy < base ploidy then + // it is trailed with the sentinel node symbol to signal + // that the remainder of the vector is NULL. + this->base_ploidy = fmt.n; + + // Find first phase + this->phase_if_uniform = 0; + // Iterate over genotypes to find the first valid phase + // continuing the search if the current value is the + // sentinel node symbol. + int j = fmt.n - 1; + for(uint32_t i = 0; i < n_samples; ++i){ + if(VcfGenotype::IsMissing(fmt.p[j]) == true + || VcfType::IsVectorEnd(fmt.p[j]) == true) + j += fmt.n; + else { + this->phase_if_uniform = fmt.p[j] & 1; + break; + } + } + + // Iterate over genotypes to compute summary statistics + // regarding missingness, number of special sentinel + // symbols and assess uniformity of phasing. 
+ j = fmt.n - 1; + for(uint32_t i = 0; i < n_samples; ++i){ + if(VcfGenotype::IsMissing(fmt.p[j]) == false + && VcfType::IsVectorEnd(fmt.p[j]) == false + && (fmt.p[j] & 1) != this->phase_if_uniform) + { + this->mixed_phasing = true; + } + + // Iterate over the number of chromosomes / individual + for(int k = 0; k < fmt.n; ++k, ++j){ + this->n_missing += VcfGenotype::IsMissing(fmt.p[j]); + this->n_vector_end += VcfType::IsVectorEnd(fmt.p[j]); + } + } + + return true; + } + + inline bool isBaseDiploid(void) const{ return(this->base_ploidy == 2); } + +public: + uint8_t base_ploidy; + bool phase_if_uniform; + bool mixed_phasing; + bool invariant; + uint64_t n_missing; + uint64_t n_vector_end; +}; + +// Returns the hrec that contains information or nullptr if none does. +const bcf_hrec_t* GetPopulatedHrec(const bcf_idpair_t& idPair); + +class VcfHeader{ +public: + typedef VcfHeader self_type; + typedef VcfContig contig_type; + typedef bcf_hdr_t hts_vcf_header; + typedef VcfFormat format_type; + typedef VcfInfo info_type; + typedef VcfFilter filter_type; + typedef VcfStructuredExtra structured_extra_type; + typedef VcfExtra extra_type; + typedef std::unordered_map map_type; + typedef std::unordered_map map_reverse_type; + +public: + VcfHeader() = default; + VcfHeader(const VcfHeader& other) : + fileformat_string_(other.fileformat_string_), + literals_(other.literals_), + samples_(other.samples_), + contigs_(other.contigs_), + info_fields_(other.info_fields_), + format_fields_(other.format_fields_), + filter_fields_(other.filter_fields_), + structured_extra_fields_(other.structured_extra_fields_), + extra_fields_(other.extra_fields_) + { + this->BuildMaps(); + this->BuildReverseMaps(); + } + + ~VcfHeader() = default; + + inline size_t GetNumberSamples(void) const{ return(this->samples_.size()); } + inline size_t GetNumberContigs(void) const{ return(this->contigs_.size()); } + + // Adds Contig information from the idPair to the ContigInfo object. 
+ void AddContigInfo(const bcf_idpair_t& idPair) { + // ID and length are special-cased in the idPair. + //std::cerr << "Contig: " << pos_in_fasta << "\t" << std::string(idPair.key) << ": " << idPair.val->info[0] << std::endl; + VcfContig c; + c.name = idPair.key; + c.n_bases = idPair.val->info[0]; + + const bcf_hrec_t* hrec0 = GetPopulatedHrec(idPair); + if (hrec0 != nullptr) { + for (int j = 0; j < hrec0->nkeys; j++) { + const std::string current_key(hrec0->keys[j]); + // Add any non-ID and non-length info to the structured map of additional + // information. + if (current_key == "ID" || + current_key == "length") + { + //continue; + } else if(current_key == "IDX"){ + c.idx = atoi(hrec0->vals[j]); + } else { + c.extra.push_back(std::pair(current_key, std::string(hrec0->vals[j]))); + } + } + } else { + std::cerr << utility::timestamp("ERROR") << "hrec error" << std::endl; + return; + } + + // Add current contig to map + if(this->contigs_.size() == 0){ + this->contigs_.push_back(c); + this->contigs_map_[c.name] = 0; + return; + } + + if(this->contigs_map_.find(c.name) == this->contigs_map_.end()){ + this->contigs_map_[c.name] = this->contigs_.size(); + this->contigs_.push_back(c); + } else { + std::cerr << utility::timestamp("ERROR") << "Illegal: duplicated contig name" << std::endl; + exit(1); + } + } + + // Adds FILTER information from the bcf_hrec_t to the VcfFilterInfo object. + void AddFilterInfo(const bcf_hrec_t* hrec) { + if (hrec->nkeys >= 2 && std::string(hrec->keys[0]) == "ID" && + std::string(hrec->keys[1]) == "Description") + { + VcfFilter f; + f.id = std::string(hrec->vals[0]); + f.description = std::string(hrec->vals[1]); + for(int i = 2; i < hrec->nkeys; ++i){ + if(std::string(hrec->keys[i]) == "IDX"){ + f.idx = atoi(hrec->vals[i]); + } + } + + // Add current filter field to map. 
+ if(this->filter_fields_.size() == 0){ + this->filter_fields_.push_back(f); + this->filter_fields_map_[f.id] = 0; + return; + } + + if(this->filter_fields_map_.find(f.id) == this->filter_fields_map_.end()){ + this->filter_fields_map_[f.id] = this->filter_fields_.size(); + this->filter_fields_.push_back(f); + } else { + std::cerr << utility::timestamp("ERROR") << "Illegal: duplicated filter name: " << f.id << std::endl; + exit(1); + } + + } else { + std::cerr << utility::timestamp("ERROR") << "Malformed FILTER field detected in header, leaving this " + "filter empty" << std::endl; + } + } + + // Adds INFO information from the bcf_hrec_t to the VcfInfo object. + void AddInfo(const bcf_hrec_t* hrec) { + if (hrec->nkeys >= 4 && std::string(hrec->keys[0]) == "ID" && + std::string(hrec->keys[1]) == "Number" && std::string(hrec->keys[2]) == "Type" && + std::string(hrec->keys[3]) == "Description") + { + VcfInfo f; + f.id = std::string(hrec->vals[0]); + f.number = std::string(hrec->vals[1]); + f.type = std::string(hrec->vals[2]); + f.description = std::string(hrec->vals[3]); + for (int i = 4; i < hrec->nkeys; i++) { + const std::string current_key = std::string(hrec->keys[i]); + if (current_key == "Source") { + f.source = std::string(hrec->vals[i]); + } else if (current_key == "Version") { + f.version = std::string(hrec->vals[i]); + } else if (current_key == "IDX") { + f.idx = atoi(hrec->vals[i]); + } + } + + // Add current info field to map. 
+ if(this->info_fields_.size() == 0){ + this->info_fields_.push_back(f); + this->info_fields_map_[f.id] = 0; + return; + } + + if(this->info_fields_map_.find(f.id) == this->info_fields_map_.end()){ + this->info_fields_map_[f.id] = this->info_fields_.size(); + this->info_fields_.push_back(f); + } else { + std::cerr << utility::timestamp("ERROR") << "Illegal: duplicated info name: " << f.id << std::endl; + exit(1); + } + + } else { + std::cerr << utility::timestamp("ERROR") << "Malformed INFO field detected in header, leaving this " + "info empty" << std::endl; + } + } + + // Adds FORMAT information from the bcf_hrec_t to the VcfFormatInfo object. + void AddFormatInfo(const bcf_hrec_t* hrec) { + if (hrec->nkeys >= 4 && std::string(hrec->keys[0]) == "ID" && + std::string(hrec->keys[1]) == "Number" && std::string(hrec->keys[2]) == "Type" && + std::string(hrec->keys[3]) == "Description") + { + VcfFormat f; + f.id = std::string(hrec->vals[0]); + f.number = std::string(hrec->vals[1]); + f.type = std::string(hrec->vals[2]); + f.description = std::string(hrec->vals[3]); + for (int i = 4; i < hrec->nkeys; i++) { + const std::string current_key = std::string(hrec->keys[i]); + if (current_key == "IDX") { + f.idx = atoi(hrec->vals[i]); + } + } + + // Add current format field to map. + if(this->format_fields_.size() == 0){ + this->format_fields_.push_back(f); + this->format_fields_map_[f.id] = 0; + return; + } + + if(this->format_fields_map_.find(f.id) == this->format_fields_map_.end()){ + this->format_fields_map_[f.id] = this->format_fields_.size(); + this->format_fields_.push_back(f); + } else { + std::cerr << utility::timestamp("ERROR") << "Illegal: duplicated format name: " << f.id << std::endl; + exit(1); + } + + } else { + std::cerr << utility::timestamp("ERROR") << "Malformed FORMAT field detected in header, leaving this " + "format empty" << std::endl; + } + } + + // Adds structured information from the bcf_hrec_t to the VcfStructuredExtra. 
+ void AddStructuredExtra(const bcf_hrec_t* hrec) { + VcfStructuredExtra f; + f.key = std::string(hrec->key); + for (int i = 0; i < hrec->nkeys; i++) + f.fields.push_back(VcfExtra(std::string(hrec->keys[i]), std::string(hrec->vals[i]))); + + this->structured_extra_fields_.push_back(f); + } + + // Adds unstructured information from the bcf_hrec_t to the VcfExtra object. + void AddExtra(const bcf_hrec_t* hrec) { + VcfExtra f; + f.key = std::string(hrec->key); + f.value = std::string(hrec->value); + this->extra_fields_.push_back(f); + } + + void AddSample(const std::string& sample_name) { + if(this->samples_.size() == 0){ + this->samples_.push_back(sample_name); + this->samples_map_[sample_name] = 0; + return; + } + + if(this->samples_map_.find(sample_name) == this->samples_map_.end()){ + this->samples_map_[sample_name] = this->samples_.size(); + this->samples_.push_back(sample_name); + } else { + std::cerr << utility::timestamp("ERROR") << "Illegal: duplicated sample name: " << sample_name << std::endl; + exit(1); + } + } + + VcfContig* GetContig(const std::string& name) { + map_type::const_iterator it = this->contigs_map_.find(name); + if(it != this->contigs_map_.end()) return(&this->contigs_[it->second]); + return(nullptr); + } + + VcfContig* GetContig(const int& idx) { + map_reverse_type::const_iterator it = this->contigs_reverse_map_.find(idx); + if(it != this->contigs_reverse_map_.end()) return(&this->contigs_[it->second]); + return(nullptr); + } + + VcfInfo* GetInfo(const std::string& name) { + map_type::const_iterator it = this->info_fields_map_.find(name); + if(it != this->info_fields_map_.end()) return(&this->info_fields_[it->second]); + return(nullptr); + } + + VcfInfo* GetInfo(const int& idx) { + map_reverse_type::const_iterator it = this->info_fields_reverse_map_.find(idx); + if(it != this->info_fields_reverse_map_.end()) return(&this->info_fields_[it->second]); + return(nullptr); + } + + VcfFormat* GetFormat(const std::string& name) { + 
map_type::const_iterator it = this->format_fields_map_.find(name); + if(it != this->format_fields_map_.end()) return(&this->format_fields_[it->second]); + return(nullptr); + } + + VcfFormat* GetFormat(const int& idx) { + map_reverse_type::const_iterator it = this->format_fields_reverse_map_.find(idx); + if(it != this->format_fields_reverse_map_.end()) return(&this->format_fields_[it->second]); + return(nullptr); + } + + VcfFilter* GetFilter(const std::string& name) { + map_type::const_iterator it = this->filter_fields_map_.find(name); + if(it != this->filter_fields_map_.end()) return(&this->filter_fields_[it->second]); + return(nullptr); + } + + VcfFilter* GetFilter(const int& idx) { + map_reverse_type::const_iterator it = this->filter_fields_reverse_map_.find(idx); + if(it != this->filter_fields_reverse_map_.end()) return(&this->filter_fields_[it->second]); + return(nullptr); + } + + std::string* GetSample(const std::string& name) { + map_type::const_iterator it = this->samples_map_.find(name); + if(it != this->samples_map_.end()) return(&this->samples_[it->second]); + return(nullptr); + } + + bool BuildReverseMaps(void){ + this->contigs_reverse_map_.clear(); + this->info_fields_reverse_map_.clear(); + this->format_fields_reverse_map_.clear(); + this->filter_fields_reverse_map_.clear(); + + for(uint32_t i = 0; i < this->contigs_.size(); ++i) this->contigs_reverse_map_[this->contigs_[i].idx] = i; + for(uint32_t i = 0; i < this->info_fields_.size(); ++i) this->info_fields_reverse_map_[this->info_fields_[i].idx] = i; + for(uint32_t i = 0; i < this->format_fields_.size(); ++i) this->format_fields_reverse_map_[this->format_fields_[i].idx] = i; + for(uint32_t i = 0; i < this->filter_fields_.size(); ++i) this->filter_fields_reverse_map_[this->filter_fields_[i].idx] = i; + + return true; + } + + bool BuildMaps(void){ + this->info_fields_map_.clear(); + this->format_fields_map_.clear(); + this->filter_fields_map_.clear(); + this->contigs_map_.clear(); + + for(uint32_t i = 
0; i < this->contigs_.size(); ++i) this->contigs_map_[this->contigs_[i].name] = i; + for(uint32_t i = 0; i < this->info_fields_.size(); ++i) this->info_fields_map_[this->info_fields_[i].id] = i; + for(uint32_t i = 0; i < this->format_fields_.size(); ++i) this->format_fields_map_[this->format_fields_[i].id] = i; + for(uint32_t i = 0; i < this->filter_fields_.size(); ++i) this->filter_fields_map_[this->filter_fields_[i].id] = i; + + return true; + } + + /**< + * Recodes the internal IDX field for contig info, INFO, FORMAT, and FILTER + * from any range to the range [0, 1, ..., n-1] as desired in Tachyon. + * @return Returns TRUE upon success or FALSE otherwise. + */ + bool RecodeIndices(void){ + for(uint32_t i = 0; i < this->contigs_.size(); ++i) this->contigs_[i].idx = i; + for(uint32_t i = 0; i < this->info_fields_.size(); ++i) this->info_fields_[i].idx = i; + for(uint32_t i = 0; i < this->format_fields_.size(); ++i) this->format_fields_[i].idx = i; + for(uint32_t i = 0; i < this->filter_fields_.size(); ++i) this->filter_fields_[i].idx = i; + + if(this->BuildMaps() == false) return false; + if(this->BuildReverseMaps() == false) return false; + return true; + } + + /**< + * Converts this header object into a hts_vcf_header object from the + * internally stored literal string. This object is required for + * writing out VCF/BCF files. 
+ * @return + */ + hts_vcf_header* ConvertVcfHeader(void){ + std::string internal = this->literals_; + internal += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; + if(this->samples_.size()){ + internal += "\tFORMAT\t"; + internal += this->samples_[0]; + for(size_t i = 1; i < this->samples_.size(); ++i) + internal += "\t" + this->samples_[i]; + } + internal += "\n"; + + hts_vcf_header* hdr = bcf_hdr_init("r"); + int ret = bcf_hdr_parse(hdr, (char*)internal.c_str()); + if(ret != 0){ + std::cerr << utility::timestamp("ERROR") << "Failed to get bcf header from literals" << std::endl; + bcf_hdr_destroy(hdr); + return(nullptr); + } + + return(hdr); + } + + // Append a string to the literal string + inline void AppendLiteralString(const std::string& literal_addition){ this->literals_ += literal_addition; } + +public: + // VCF file version string. + std::string fileformat_string_; + // Literal string for VcfHeader data. Contains all of the Vcf header data up + // to the start of the main header line ("#CHROM"...). As such, sample names + // are not available in this string and needs to be appended before converting + // back into a htslib vcf header. + std::string literals_; + + // Vcf header lines parse into: + // Samples: Individual sample names. + // VcfContig: Information relating to the interpretation of a contig. Data + // include its name, length in bases, its internal index identifier + // and optional additional information. 
+ // VcfInfo: Data specifying a given INFO field + // VcfFormat: Data specifying a given FORMAT field + // VcfFilter: Data specifying a given FILTER field + // VcfStructuredExtra: + std::vector samples_; + std::vector contigs_; + std::vector info_fields_; + std::vector format_fields_; + std::vector filter_fields_; + std::vector structured_extra_fields_; + std::vector extra_fields_; + + // Utility members + // + // Hash tables allowing the mapping from the unique identifier string + // (such as contig name) to the relative index offset of that object. + // This approach requires another layer of indirection when mapping + // from the index to the actual target. For example: + // + // contigs[contigs_map_["chr20"].second] <- maps to the actual target + // + // The reverse maps allows the mapping from a unique IDX identifier + // to the relative index offset of that object. As above, this requires + // an addition indirect lookup to access the desired object. For example + // mapping the first occuring FORMAT field to its name: + // + // reader->vcf_header_.format_fields_[reader->vcf_header_.format_fields_reverse_map_[container.at(0)->d.fmt[0].id]].id + // + // map_type hash tables permits mapping string name -> index offset + // map_reverse_type hash tables permits mapping integer IDX -> index offset + map_type samples_map_; + map_type contigs_map_; + map_type info_fields_map_; + map_type format_fields_map_; + map_type filter_fields_map_; + map_reverse_type contigs_reverse_map_; // map IDX -> index offset + map_reverse_type info_fields_reverse_map_; // map IDX -> index offset + map_reverse_type format_fields_reverse_map_; // map IDX -> index offset + map_reverse_type filter_fields_reverse_map_; // map IDX -> index offset +}; + +class VcfReader{ +public: + typedef VcfReader self_type; + +public: + // Singleton design pattern for retrieving a guaranteed unique pointer + // to a VcfReader. 
This choice is to prevent inadvertent writes to the + // target file as another file-handle is accessing it. + static std::unique_ptr FromFile(const std::string& variants_path){ + htsFile* fp = hts_open(variants_path.c_str(), "r"); + if (fp == nullptr) { + std::cerr << utility::timestamp("ERROR") << "Could not open " << variants_path << std::endl; + return nullptr; + } + + bcf_hdr_t* header = bcf_hdr_read(fp); + if (header == nullptr){ + std::cerr << utility::timestamp("ERROR") << "Couldn't parse header for " << fp->fn << std::endl; + return nullptr; + } + + return std::unique_ptr(new self_type(variants_path, fp, header)); + } + + bool next(const int unpack_level = BCF_UN_ALL){ + if (bcf_read(this->fp_, this->header_, this->bcf1_) < 0) { + if (bcf1_->errcode) { + std::cerr << utility::timestamp("ERROR") << "Failed to parse VCF record: " << bcf1_->errcode << std::endl; + return false; + } else { + return false; + } + } + + bcf_unpack(this->bcf1_, unpack_level); + return true; + } + + bool next(bcf1_t* bcf_entry, const int unpack_level = BCF_UN_ALL){ + if (bcf_read(this->fp_, this->header_, bcf_entry) < 0) { + if (bcf_entry->errcode) { + std::cerr << utility::timestamp("ERROR") << "Failed to parse VCF record: " << bcf1_->errcode << std::endl; + return false; + } else { + //std::cerr << utility::timestamp("ERROR") << "Failed to retrieve a htslib bcf1_t record!" << std::endl; + return false; + } + } + + bcf_unpack(bcf_entry, unpack_level); + return true; + } + + /**< + * Utility function that writes the VcfHeader literals string into + * a target output stream. The literals string does NOT contain + * sample information or the column header string ("#CHROM..."). + * @param stream Target output stream + */ + inline void PrintLiterals(std::ostream& stream) const{ stream << this->vcf_header_.literals_ << std::endl; } + + /**< + * Utility function that writes a valid VCF header output string + * to the target stream. 
+ * @param stream Target output stream + */ + void PrintVcfHeader(std::ostream& stream) const{ + this->PrintLiterals(stream); + stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; + if(this->vcf_header_.samples_.size()){ + stream << "\tFORMAT\t"; + stream << this->vcf_header_.samples_[0]; + for(size_t i = 1; i < this->vcf_header_.samples_.size(); ++i) + stream << "\t" + this->vcf_header_.samples_[i]; + } + stream << "\n"; + } + +private: + VcfReader(const std::string& variants_path, + htsFile* fp, + bcf_hdr_t* header) : + fp_(fp), + header_(header), + bcf1_(bcf_init()) +{ + if (this->header_->nhrec < 1) { + std::cerr << utility::timestamp("ERROR") << "Empty header, not a valid VCF." << std::endl; + return; + } + + // Store the file-format header string + if (std::string(this->header_->hrec[0]->key) != "fileformat") { + std::cerr << utility::timestamp("ERROR") << "Not a valid VCF, fileformat needed: " << variants_path << std::endl; + } else { + this->vcf_header_.fileformat_string_ = std::string(this->header_->hrec[0]->key); + } + + // Fill in the contig info for each contig in the VCF header. Directly + // accesses the low-level C struct because there are no indirection + // macros/functions by htslib API. + // BCF_DT_CTG: offset for contig (CTG) information in BCF dictionary (DT). + const int n_contigs = this->header_->n[BCF_DT_CTG]; + for (int i = 0; i < n_contigs; ++i) { + const bcf_idpair_t& idPair = this->header_->id[BCF_DT_CTG][i]; + this->vcf_header_.AddContigInfo(idPair); + } + + // Iterate through all hrecs (except the first, which was 'fileformat') to + // populate the rest of the headers. + for (int i = 1; i < this->header_->nhrec; i++) { + const bcf_hrec_t* hrec0 = this->header_->hrec[i]; + switch (hrec0->type) { + case BCF_HL_CTG: + // Contigs are populated above, since they store length in the + // bcf_idinfo_t* structure. 
+ break; + case BCF_HL_FLT: + this->vcf_header_.AddFilterInfo(hrec0); + break; + case BCF_HL_INFO: + this->vcf_header_.AddInfo(hrec0); + break; + case BCF_HL_FMT: + this->vcf_header_.AddFormatInfo(hrec0); + break; + case BCF_HL_STR: + this->vcf_header_.AddStructuredExtra(hrec0); + break; + case BCF_HL_GEN: + this->vcf_header_.AddExtra(hrec0); + break; + default: + std::cerr << utility::timestamp("ERROR") << "Unknown hrec0->type: " << hrec0->type << std::endl; + break; + } + } + + // Populate samples info. + int n_samples = bcf_hdr_nsamples(this->header_); + for (int i = 0; i < n_samples; i++) + this->vcf_header_.AddSample(std::string(this->header_->samples[i])); + + this->vcf_header_.BuildReverseMaps(); + + // Build literal VCF header string for storage. + kstring_t htxt = {0,0,0}; + bcf_hdr_format(this->header_, 0, &htxt); + while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros + std::string temp = std::string(htxt.s, htxt.l); + size_t pos = temp.find("#CHROM"); // search for start of column header line + temp = temp.substr(0, pos); + this->vcf_header_.literals_ = temp; + free(htxt.s); +} + +public: + ~VcfReader() { + bcf_destroy(this->bcf1_); + bcf_hdr_destroy(this->header_); + hts_close(this->fp_); + } + +public: + // Contextual representation of vcf header + VcfHeader vcf_header_; + + // A pointer to the htslib file used to access the VCF data. + htsFile * fp_; + + // A htslib header data structure obtained by parsing the header of this VCF. + bcf_hdr_t * header_; + + // htslib representation of a parsed vcf line. 
+ bcf1_t* bcf1_; +}; + +} + +} + + + +#endif /* IO_HTSLIB_INTEGRATION_H_ */ diff --git a/lib/main.cpp b/lib/main.cpp index a0d09ef..a1a3fd4 100644 --- a/lib/main.cpp +++ b/lib/main.cpp @@ -35,8 +35,7 @@ int main(int argc, char** argv){ } if(argc == 1){ - programMessage(); - programHelpDetailed(); + programHelp(); return(1); } @@ -52,13 +51,13 @@ int main(int argc, char** argv){ } else if(strncmp(subroutine.data(), "view", 4) == 0 && subroutine.size() == 4){ return(view(argc, argv)); } else if(strncmp(subroutine.data(), "stats", 5) == 0 && subroutine.size() == 5){ - return(stats(argc, argv)); + std::cerr << "Not implemented" << std::endl; + return(0); } else if(strncmp(subroutine.data(), "check", 5) == 0 && subroutine.size() == 5){ std::cerr << "Not implemented" << std::endl; return(0); } else { - programMessage(); - programHelpDetailed(); + programHelp(); std::cerr << tachyon::utility::timestamp("ERROR") << "Illegal command: " << subroutine << std::endl; return(1); } diff --git a/lib/math/square_matrix.h b/lib/math/square_matrix.h index df07572..4090b47 100644 --- a/lib/math/square_matrix.h +++ b/lib/math/square_matrix.h @@ -3,7 +3,7 @@ #include // size_t -#include "algorithm/permutation/permutation_manager.h" +#include "core/genotypes.h" namespace tachyon{ namespace math{ @@ -11,8 +11,8 @@ namespace math{ template class SquareMatrix{ private: - typedef SquareMatrix self_type; - typedef algorithm::PermutationManager ppa_type; + typedef SquareMatrix self_type; + typedef yon_gt_ppa ppa_type; public: SquareMatrix(const U32 width) : diff --git a/lib/stats.h b/lib/stats.h index e372f5c..8002fd3 100644 --- a/lib/stats.h +++ b/lib/stats.h @@ -36,8 +36,7 @@ void stats_usage(void){ int stats(int argc, char** argv){ if(argc <= 2){ - programMessage(); - programHelpDetailed(); + programHelp(); return(1); } @@ -97,12 +96,6 @@ int stats(int argc, char** argv){ tachyon::VariantReader reader; - // temp - if(keychain_file.size()){ - if(reader.loadKeychainFile(keychain_file) == 
false) - return 1; - } - if(!reader.open(input)){ std::cerr << "failed to open" << std::endl; return 1; @@ -111,6 +104,7 @@ int stats(int argc, char** argv){ tachyon::algorithm::Timer timer, timer2; timer.Start(); timer2.Start(); + /* reader.getBlockSettings().contig(true, true); reader.getBlockSettings().positions(true, true); reader.getBlockSettings().controller(true, true); @@ -118,19 +112,20 @@ int stats(int argc, char** argv){ reader.getBlockSettings().loadGenotypes(true); reader.getBlockSettings().ppa(true, true); reader.getBlockSettings().alleles(true, true); + */ U32 block_counter = 0; - std::vector global_titv(reader.getGlobalHeader().getSampleNumber()); - while(reader.nextBlock()){ - reader.getTiTVRatios(std::cout, global_titv); + std::vector global_titv(reader.GetGlobalHeader().GetNumberSamples()); + while(reader.NextBlock()){ + //reader.getTiTVRatios(global_titv); //reader.getGenotypeSummary(std::cout); - std::cerr << block_counter++ << "/" << reader.getIndex().size() << " in " << timer.ElapsedString() << " " << timer2.ElapsedString() << " " << timer2.Elapsed().count()/(block_counter+1)*reader.getIndex().size() << std::endl; + std::cerr << block_counter++ << "/" << reader.GetIndex().size() << " in " << timer.ElapsedString() << " " << timer2.ElapsedString() << " " << timer2.Elapsed().count()/(block_counter+1)*reader.GetIndex().size() << std::endl; timer.Start(); } std::cout << "Sample\tTransversions\tTransitions\tTiTV\tAA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\ttotalVariants\tn_insertions\n"; for(U32 i = 0; i < global_titv.size(); ++i){ - std::cout << reader.getGlobalHeader().samples[i].name << '\t' << global_titv[i] << '\n'; + std::cout << reader.GetGlobalHeader().samples_[i] << '\t' << global_titv[i] << '\n'; } return 0; diff --git a/lib/support/enums.h b/lib/support/enums.h index d1d0a05..02c3bd2 100644 --- a/lib/support/enums.h +++ b/lib/support/enums.h @@ -40,7 +40,8 @@ enum TACHYON_GT_ENCODING{ YON_GT_RLE_DIPLOID_BIALLELIC = 
0,//!< YON_GT_RLE_DIPLOID_BIALLELIC YON_GT_RLE_DIPLOID_NALLELIC = 1,//!< YON_GT_RLE_DIPLOID_NALLELIC YON_GT_BCF_DIPLOID = 2,//!< YON_GT_BCF_DIPLOID - YON_GT_BCF_STYLE = 3 //!< YON_GT_BCF_STYLE + YON_GT_BCF_STYLE = 3,//!< YON_GT_BCF_STYLE + YON_GT_RLE_NPLOID = 4 }; /**< diff --git a/lib/support/helpers.cpp b/lib/support/helpers.cpp index b2a89b5..3dfcc6c 100644 --- a/lib/support/helpers.cpp +++ b/lib/support/helpers.cpp @@ -193,5 +193,18 @@ bool HexToBytes(const std::string& hex, uint8_t* target){ return true; } +void SerializeString(const std::string& string, std::ostream& stream){ + size_t size_helper = string.size(); + stream.write((const char*)&size_helper, sizeof(size_t)); + stream.write(string.data(), string.size()); +} + +void DeserializeString(std::string& string, std::istream& stream){ + size_t l_string; + stream.read((char*)&l_string, sizeof(size_t)); + string.resize(l_string); + stream.read(&string[0], l_string); +} + } } diff --git a/lib/support/helpers.h b/lib/support/helpers.h index 7dd51c8..37c9b3f 100644 --- a/lib/support/helpers.h +++ b/lib/support/helpers.h @@ -6,15 +6,18 @@ #include #include #include -#include #include #include +#include #include "type_definitions.h" +#include "io/basic_buffer.h" namespace tachyon{ namespace utility{ +const std::regex YON_VARIANT_STANDARD("^[ATGC]{1,}$"); + int isBigEndian(void); std::vector &split(std::string &s, char delim, std::vector &elems); @@ -92,6 +95,19 @@ std::string toPrettyDiskString(const T value){ return(std::to_string(value) + " B"); } +void SerializeString(const std::string& string, std::ostream& stream); +void DeserializeString(std::string& string, std::istream& stream); + +template +void SerializePrimitive(const T& value, std::ostream& stream){ + stream.write((const char*)&value, sizeof(T)); +} + +template +void DeserializePrimitive(T& value, std::istream& stream){ + stream.read((char*)&value, sizeof(T)); +} + } } diff --git a/lib/support/magic_constants.h b/lib/support/magic_constants.h index 
eb2b6b2..ee536b5 100644 --- a/lib/support/magic_constants.h +++ b/lib/support/magic_constants.h @@ -16,7 +16,7 @@ extern std::string INTERPRETED_COMMAND; /*------ Version ------*/ const S32 TACHYON_VERSION_MAJOR = 0; -const S32 TACHYON_VERSION_MINOR = 1; +const S32 TACHYON_VERSION_MINOR = 3; const S32 TACHYON_VERSION_PATCH = 0; const S32 TACHYON_VERSION_NUMBER = (TACHYON_VERSION_MAJOR *100*100 + TACHYON_VERSION_MINOR *100 + TACHYON_VERSION_PATCH); const std::string TACHYON_LIB_VERSION = std::to_string(TACHYON_VERSION_MAJOR) + '.' + std::to_string(TACHYON_VERSION_MINOR) + '.' + std::to_string(TACHYON_VERSION_PATCH); diff --git a/lib/third_party/zlib b/lib/third_party/zlib deleted file mode 160000 index cacf7f1..0000000 --- a/lib/third_party/zlib +++ /dev/null @@ -1 +0,0 @@ -Subproject commit cacf7f1d4e3d44d871b605da3b647f07d718623f diff --git a/lib/utility.h b/lib/utility.h index 4b85db7..6405654 100644 --- a/lib/utility.h +++ b/lib/utility.h @@ -3,6 +3,7 @@ #include +#include #include #include #include @@ -10,33 +11,55 @@ #include "support/helpers.h" #include "support/magic_constants.h" -// Declare extern +// These utility strings have been declared extern so that +// they can be used globally throughout the program. +// The LITERAL_COMMAND_LINE string is populated with the literal +// character input from the ABI to the program. +// The INTERPRETED_COMMAND string is the internal dump of the +// settings used for a subroutine. This is very useful for +// debugging and legacy purposes. std::string tachyon::constants::LITERAL_COMMAND_LINE; std::string tachyon::constants::INTERPRETED_COMMAND; +/**< + * Print a standard program message to standard out. This message + * describes the current git version, library version, and linked + * library versions. + * @param separator Boolean regulating the presence/absence of a dashed delimiter line after the program message. 
+ */ void programMessage(const bool separator = true){ - std::cerr << "Program: " << tachyon::constants::PROGRAM_NAME << " " << VERSION << std::endl; - std::cerr << "Libraries: " << tachyon::constants::PROGRAM_NAME << '-' << tachyon::constants::TACHYON_LIB_VERSION << "; " - << SSLeay_version(SSLEAY_VERSION) << "; " - << "ZSTD-" << ZSTD_versionString() << std::endl; + // The OpenSSL version string generally returns a visually unappealing format + // that includes placeholder X's. These are generally preceded by two single + // blank spaces. If we find two consecutive blank spaces then present the + // substring from [0, match) left-inclusive. + std::string openssl_version_string = SSLeay_version(SSLEAY_VERSION); + size_t match = openssl_version_string.find(" "); + if(match != std::string::npos) openssl_version_string = openssl_version_string.substr(0, match); + + // General message for program version, git version, and linked library versions. + std::cerr << "Program: " << tachyon::constants::PROGRAM_NAME << "-" << VERSION << " (Tools for querying and manipulating variant call data)" << std::endl; + std::cerr << "Libraries: " << tachyon::constants::PROGRAM_NAME << '-' << tachyon::constants::TACHYON_LIB_VERSION + << "; " << openssl_version_string + << "; ZSTD-" << ZSTD_versionString() + << "; htslib " << std::string(hts_version()) << std::endl; std::cerr << "Contact: Marcus D. R. Klarqvist " << std::endl; std::cerr << "Documentation: https://github.com/mklarqvist/tachyon" << std::endl; std::cerr << "License: MIT" << std::endl; if(separator) std::cerr << "----------" << std::endl; } +/**< + * Extension of the programMessage function. First prints the + * program message to standard out and then prints out the available + * command names and their descriptions. 
+ */ void programHelp(void){ - std::cerr << "Usage: " << tachyon::constants::PROGRAM_NAME << " [--version] [--help] " << std::endl; - std::cerr << "Commands: import, view" << std::endl; -} - -void programHelpDetailed(void){ - programHelp(); - std::cerr << - "\n" - "import import VCF/BCF to YON\n" - "view YON->VCF/BCF conversion, YON subset and filter\n" - "check comprehensive file integrity checks\n" << std::endl; + programMessage(false); + std::cerr << "\nUsage: " << tachyon::constants::PROGRAM_NAME << " [--version] [--help] " + "\n\n" + "Commands:\n" + "import import VCF/BCF to YON\n" + "view YON->VCF/BCF conversion, YON subset and filter\n" << std::endl; } #endif /* TACHYON_UTILITY_H_ */ diff --git a/lib/utility/support_vcf.cpp b/lib/utility/support_vcf.cpp index d936643..d27a041 100644 --- a/lib/utility/support_vcf.cpp +++ b/lib/utility/support_vcf.cpp @@ -3,250 +3,73 @@ namespace tachyon{ namespace utility{ -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container){ - if(container.size() == 0) - return(stream.put('.')); - - stream << container[0]; - for(U32 i = 1; i < container.size(); ++i) - stream << ',' << (U32)container[i]; - - return(stream); -} - -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container){ - if(container.size() == 0) - return(stream.put('.')); - - stream << container[0]; - for(U32 i = 1; i < container.size(); ++i) - stream << ',' << (U32)container[i]; - - return(stream); -} - -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container){ - if(container.size() == 0) - return(stream.put('.')); - - stream << container[0]; - for(U32 i = 1; i < container.size(); ++i) - stream << ',' << container[i]; - - return(stream); -} - -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container){ - if(container.size() == 0) - return(stream.put('.')); - - stream << container[0]; - for(U32 i = 1; i < 
container.size(); ++i) - stream << ',' << container[i]; - - return(stream); -} - -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container){ - if(container.size() == 0) - return(stream.put('.')); - - const BYTE* const ref = reinterpret_cast(container.data()); - - // If the first value is end-of-vector then return - if(ref[0] == YON_BYTE_EOV) - return(stream.put('.')); - - // First value - if(ref[0] == YON_BYTE_MISSING) stream << '.'; - else stream << (S32)container[0]; - - // Remainder values - for(U32 i = 1; i < container.size(); ++i){ - if(ref[i] == YON_BYTE_MISSING) stream << ",."; - else if(ref[i] == YON_BYTE_EOV){ return stream; } - else stream << ',' << (S32)container[i]; - } - - return(stream); -} - -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container){ - if(container.size() == 0) - return(stream.put('.')); - - const U16* const ref = reinterpret_cast(container.data()); - - // If the first value is end-of-vector then return - if(ref[0] == YON_SHORT_EOV) - return(stream.put('.')); - - // First value - if(ref[0] == YON_SHORT_MISSING) stream << '.'; - else stream << (S32)container[0]; - - // Remainder values - for(U32 i = 1; i < container.size(); ++i){ - if(ref[i] == YON_SHORT_MISSING) stream << ",."; - else if(ref[i] == YON_SHORT_EOV){ return stream; } - else stream << ',' << (S32)container[i]; - } - - return(stream); -} - -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container){ - if(container.size() == 0) - return(stream.put('.')); - - const U32* const ref = reinterpret_cast(container.data()); - - // If the first value is end-of-vector then return - if(ref[0] == YON_INT_EOV){ - return(stream.put('.')); - } - - // First value - if(ref[0] == YON_INT_MISSING) stream << '.'; - else stream << container[0]; - - // Remainder values - for(U32 i = 1; i < container.size(); ++i){ - if(ref[i] == YON_INT_MISSING) stream << ",."; - else 
if(ref[i] == YON_INT_EOV){ return stream; } - else stream << ',' << container[i]; - } - - return(stream); -} - -// Special case -std::ostream& to_vcf_string_char(std::ostream& stream, const containers::PrimitiveContainer& container){ - if(container.size() == 0) - return(stream.put('.')); - - stream << container[0]; - for(U32 i = 1; i < container.size(); ++i) - stream << ',' << container[i]; - - return(stream); -} - -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container){ - if(container.size() == 0) - return(stream.put('.')); - - const U32* const ref = reinterpret_cast(container.data()); - - // If the first value is end-of-vector then return - if(ref[0] == YON_FLOAT_EOV) - return(stream.put('.')); - - // First value - if(ref[0] == YON_FLOAT_MISSING) stream << '.'; - else stream << container[0]; - - // Remainder values - for(U32 i = 1; i < container.size(); ++i){ - if(ref[i] == YON_FLOAT_MISSING) stream << ",."; - else if(ref[i] == YON_FLOAT_EOV){ return stream; } - else stream << ',' << container[i]; - } - - return(stream); -} - -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container){ - if(container.size() == 0) - return(stream.put('.')); - - const U32* const ref = reinterpret_cast(container.data()); - - // If the first value is end-of-vector then return - if(ref[0] == YON_FLOAT_EOV) - return(stream.put('.')); - - // First value - if(ref[0] == YON_FLOAT_MISSING) stream << '.'; - else stream << container[0]; - - // Remainder values - for(U32 i = 1; i < container.size(); ++i){ - if(ref[i] == YON_FLOAT_MISSING) stream << ",."; - else if(ref[i] == YON_FLOAT_EOV){ return stream; } - else stream << ',' << container[i]; - } - - return(stream); -} - -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const BYTE* data, const size_t n_data){ + 
if(n_data == 0){ buffer += '.'; return(buffer); } - buffer.AddReadble((U32)container[0]); - for(U32 i = 1; i < container.size(); ++i){ + buffer.AddReadble((U32)data[0]); + for(U32 i = 1; i < n_data; ++i){ buffer += ','; - buffer.AddReadble((U32)container[i]); + buffer.AddReadble((U32)data[i]); } return(buffer); } -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const U16* data, const size_t n_data){ + if(n_data == 0){ buffer += '.'; return(buffer); } - buffer.AddReadble(container[0]); - for(U32 i = 1; i < container.size(); ++i){ + buffer.AddReadble(data[0]); + for(U32 i = 1; i < n_data; ++i){ buffer += ','; - buffer.AddReadble(container[i]); + buffer.AddReadble(data[i]); } return(buffer); } -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const U32* data, const size_t n_data){ + if(n_data == 0){ buffer += '.'; return(buffer); } - buffer.AddReadble(container[0]); - for(U32 i = 1; i < container.size(); ++i){ + buffer.AddReadble(data[0]); + for(U32 i = 1; i < n_data; ++i){ buffer += ','; - buffer.AddReadble(container[i]); + buffer.AddReadble(data[i]); } return(buffer); } -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const U64* data, const size_t n_data){ + if(n_data == 0){ buffer += '.'; return(buffer); } - buffer.AddReadble(container[0]); - for(U32 i = 1; i < container.size(); ++i){ + buffer.AddReadble(data[0]); + for(U32 i = 1; i < n_data; ++i){ buffer += ','; - buffer.AddReadble(container[i]); + buffer.AddReadble(data[i]); } return(buffer); } -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& 
container){ - if(container.size() == 0){ +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const SBYTE* data, const size_t n_data){ + if(n_data == 0){ buffer += '.'; return(buffer); } - const BYTE* const ref = reinterpret_cast(container.data()); + const BYTE* const ref = reinterpret_cast(data); // If the first value is end-of-vector then return if(ref[0] == YON_BYTE_EOV){ @@ -256,28 +79,28 @@ io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::Primit // First value if(ref[0] == YON_BYTE_MISSING) buffer += '.'; - else buffer.AddReadble((S32)container[0]); + else buffer.AddReadble((S32)data[0]); // Remainder values - for(U32 i = 1; i < container.size(); ++i){ + for(U32 i = 1; i < n_data; ++i){ if(ref[i] == YON_BYTE_MISSING) buffer += ",."; else if(ref[i] == YON_BYTE_EOV){ return buffer; } else { buffer += ','; - buffer.AddReadble((S32)container[i]); + buffer.AddReadble((S32)data[i]); } } return(buffer); } -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const S16* data, const size_t n_data){ + if(n_data == 0){ buffer += '.'; return(buffer); } - const U16* const ref = reinterpret_cast(container.data()); + const U16* const ref = reinterpret_cast(data); // If the first value is end-of-vector then return if(ref[0] == YON_SHORT_EOV){ @@ -287,28 +110,28 @@ io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::Primit // First value if(ref[0] == YON_SHORT_MISSING) buffer += '.'; - else buffer.AddReadble((S32)container[0]); + else buffer.AddReadble((S32)data[0]); // Remainder values - for(U32 i = 1; i < container.size(); ++i){ + for(U32 i = 1; i < n_data; ++i){ if(ref[i] == YON_SHORT_MISSING) buffer += ",."; else if(ref[i] == YON_SHORT_EOV){ return buffer; } else { buffer += ','; - buffer.AddReadble((S32)container[i]); + buffer.AddReadble((S32)data[i]); } } return(buffer); } 
-io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const S32* data, const size_t n_data){ + if(n_data == 0){ buffer += '.'; return(buffer); } - const U32* const ref = reinterpret_cast(container.data()); + const U32* const ref = reinterpret_cast(data); // If the first value is end-of-vector then return if(ref[0] == YON_INT_EOV){ @@ -318,15 +141,15 @@ io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::Primit // First value if(ref[0] == YON_INT_MISSING) buffer += '.'; - else buffer.AddReadble(container[0]); + else buffer.AddReadble(data[0]); // Remainder values - for(U32 i = 1; i < container.size(); ++i){ + for(U32 i = 1; i < n_data; ++i){ if(ref[i] == YON_INT_MISSING) buffer += ",."; else if(ref[i] == YON_INT_EOV){ return buffer; } else { buffer += ','; - buffer.AddReadble(container[i]); + buffer.AddReadble(data[i]); } } @@ -334,28 +157,28 @@ io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::Primit } // Special case -io::BasicBuffer& to_vcf_string_char(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ +io::BasicBuffer& to_vcf_string_char(io::BasicBuffer& buffer, const char* data, const size_t n_data){ + if(n_data == 0){ buffer += '.'; return(buffer); } - buffer += container[0]; - for(U32 i = 1; i < container.size(); ++i){ + buffer += data[0]; + for(U32 i = 1; i < n_data; ++i){ buffer += ','; - buffer += container[i]; + buffer += data[i]; } return(buffer); } -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const float* data, const size_t n_data){ + if(n_data == 0){ buffer += '.'; return(buffer); } - const U32* const ref = reinterpret_cast(container.data()); + const U32* const ref = 
reinterpret_cast(data); // If the first value is end-of-vector then return if(ref[0] == YON_FLOAT_EOV){ @@ -365,28 +188,28 @@ io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::Primit // First value if(ref[0] == YON_FLOAT_MISSING) buffer += '.'; - else buffer.AddReadble(container[0]); + else buffer.AddReadble(data[0]); // Remainder values - for(U32 i = 1; i < container.size(); ++i){ + for(U32 i = 1; i < n_data; ++i){ if(ref[i] == YON_FLOAT_MISSING) buffer += ",."; else if(ref[i] == YON_FLOAT_EOV){ return buffer; } else { buffer += ','; - buffer.AddReadble(container[i]); + buffer.AddReadble(data[i]); } } return(buffer); } -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const double* data, const size_t n_data){ + if(n_data == 0){ buffer += '.'; return(buffer); } - const U32* const ref = reinterpret_cast(container.data()); + const U32* const ref = reinterpret_cast(data); // If the first value is end-of-vector then return if(ref[0] == YON_FLOAT_EOV){ @@ -396,368 +219,28 @@ io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::Primit // First value if(ref[0] == YON_FLOAT_MISSING) buffer += '.'; - else buffer.AddReadble(container[0]); + else buffer.AddReadble(data[0]); // Remainder values - for(U32 i = 1; i < container.size(); ++i){ + for(U32 i = 1; i < n_data; ++i){ if(ref[i] == YON_FLOAT_MISSING) buffer += ",."; else if(ref[i] == YON_FLOAT_EOV){ return buffer; } else { buffer += ','; - buffer.AddReadble(container[i]); - } - } - - return(buffer); -} - -/////////// -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ - buffer += "null"; - return(buffer); - } - - if(container.size() == 1){ - buffer.AddReadble((U32)container[0]); - } else { - buffer += '['; - buffer.AddReadble((U32)container[0]); - for(U32 
i = 1; i < container.size(); ++i){ - buffer += ','; - buffer.AddReadble((U32)container[i]); - } - buffer += ']'; - } - - return(buffer); -} - -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ - buffer += "null"; - return(buffer); - } - - if(container.size() == 1){ - buffer.AddReadble(container[0]); - } else { - buffer += '['; - buffer.AddReadble(container[0]); - for(U32 i = 1; i < container.size(); ++i){ - buffer += ','; - buffer.AddReadble(container[i]); + buffer.AddReadble(data[i]); } - buffer += ']'; } return(buffer); } -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ - buffer += "null"; - return(buffer); - } - - if(container.size() == 1){ - buffer.AddReadble(container[0]); - } else { - buffer += '['; - buffer.AddReadble(container[0]); - for(U32 i = 1; i < container.size(); ++i){ - buffer += ','; - buffer.AddReadble(container[i]); - } - buffer += ']'; - } - - return(buffer); -} - -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ - buffer += "null"; - return(buffer); - } - - if(container.size() == 1){ - buffer.AddReadble(container[0]); - } else { - buffer += '['; - buffer.AddReadble(container[0]); - for(U32 i = 1; i < container.size(); ++i){ - buffer += ','; - buffer.AddReadble(container[i]); - } - buffer += ']'; - } - - return(buffer); -} - -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ - buffer += "null"; - return(buffer); - } - - const BYTE* const ref = reinterpret_cast(container.data()); - - // If the first value is end-of-vector then return - if(ref[0] == YON_BYTE_EOV){ - buffer += "null"; - return(buffer); - } - - // First value - if(container.size() == 1){ - if(ref[0] == YON_BYTE_MISSING) buffer += "null"; - 
else buffer.AddReadble((S32)container[0]); - return(buffer); - } - - buffer += '['; - if(ref[0] == YON_BYTE_MISSING) buffer += "null"; - else buffer.AddReadble((S32)container[0]); - - // Remainder values - for(U32 i = 1; i < container.size(); ++i){ - if(ref[i] == YON_BYTE_MISSING) buffer += ",null"; - else if(ref[i] == YON_BYTE_EOV){ return buffer; } - else { - buffer += ','; - buffer.AddReadble((S32)container[i]); - } - } - buffer += ']'; - - return(buffer); -} - -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ - buffer += "null"; - return(buffer); - } - - const U16* const ref = reinterpret_cast(container.data()); - - // If the first value is end-of-vector then return - if(ref[0] == YON_SHORT_EOV){ - buffer += "null"; - return(buffer); - } - - // First value - if(container.size() == 1){ - if(ref[0] == YON_SHORT_MISSING) buffer += "null"; - else buffer.AddReadble((S32)container[0]); - return(buffer); - } - - buffer += '['; - if(ref[0] == YON_SHORT_MISSING) buffer += "null"; - else buffer.AddReadble((S32)container[0]); - - // Remainder values - for(U32 i = 1; i < container.size(); ++i){ - if(ref[i] == YON_SHORT_MISSING) buffer += ",null"; - else if(ref[i] == YON_SHORT_EOV){ return buffer; } - else { - buffer += ','; - buffer.AddReadble((S32)container[i]); - } - } - buffer += ']'; - - return(buffer); -} - -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ - buffer += '.'; - return(buffer); - } - - const U32* const ref = reinterpret_cast(container.data()); - - // If the first value is end-of-vector then return - if(ref[0] == YON_INT_EOV){ - buffer += "null"; - return(buffer); - } - - // First value - if(container.size() == 1){ - if(ref[0] == YON_INT_MISSING) buffer += "null"; - else buffer.AddReadble((S32)container[0]); - return(buffer); - } - - buffer += '['; - if(ref[0] == YON_INT_MISSING) 
buffer += "null"; - else buffer.AddReadble((S32)container[0]); - - // Remainder values - for(U32 i = 1; i < container.size(); ++i){ - if(ref[i] == YON_INT_MISSING) buffer += ",null"; - else if(ref[i] == YON_INT_EOV){ return buffer; } - else { - buffer += ','; - buffer.AddReadble((S32)container[i]); - } - } - buffer += ']'; - - return(buffer); -} - -// Special case -io::BasicBuffer& to_json_string_char(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ - buffer += "null"; - return(buffer); - } - - if(container.size() == 1){ - buffer += container[0]; - return(buffer); - } - - buffer += '['; - buffer += container[0]; - for(U32 i = 1; i < container.size(); ++i){ - buffer += ','; - buffer += container[i]; - } - buffer += ']'; - - return(buffer); -} - -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ - buffer += "null"; - return(buffer); - } - - const U32* const ref = reinterpret_cast(container.data()); - - // If the first value is end-of-vector then return - if(ref[0] == YON_FLOAT_EOV){ - buffer += "null"; - return(buffer); - } - - // First value - if(container.size() == 1){ - if(ref[0] == YON_FLOAT_MISSING) buffer += "null"; - else buffer.AddReadble(container[0]); - return(buffer); - } - - buffer += '['; - if(ref[0] == YON_FLOAT_MISSING) buffer += "null"; - else buffer.AddReadble(container[0]); - - // Remainder values - for(U32 i = 1; i < container.size(); ++i){ - if(ref[i] == YON_FLOAT_MISSING) buffer += ",null"; - else if(ref[i] == YON_FLOAT_EOV){ return buffer; } - else { - buffer += ','; - buffer.AddReadble(container[i]); - } - } - buffer += ']'; - - return(buffer); -} - -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container){ - if(container.size() == 0){ - buffer += "null"; - return(buffer); - } - - const U32* const ref = reinterpret_cast(container.data()); - - // If the first 
value is end-of-vector then return - if(ref[0] == YON_FLOAT_EOV){ - buffer += "null"; - return(buffer); - } - - // First value - if(container.size() == 1){ - if(ref[0] == YON_FLOAT_MISSING) buffer += "null"; - else buffer.AddReadble(container[0]); - return(buffer); - } - - buffer += '['; - if(ref[0] == YON_FLOAT_MISSING) buffer += "null"; - else buffer.AddReadble(container[0]); - - // Remainder values - for(U32 i = 1; i < container.size(); ++i){ - if(ref[i] == YON_FLOAT_MISSING) buffer += ",null"; - else if(ref[i] == YON_FLOAT_EOV){ return buffer; } - else { - buffer += ','; - buffer.AddReadble(container[i]); - } - } - buffer += ']'; - - return(buffer); -} - -/// - -std::ostream& to_vcf_string(std::ostream& stream, const core::GTObject& gt_object){ - if(gt_object.n_ploidy == 0) - return(stream); - - for(U32 i = 0; i < gt_object.n_objects; ++i){ - stream << (int)gt_object[0].allele << (gt_object[i].phase ? '|' : '/') << (int)gt_object[1].allele; - for(U32 j = 1; j < gt_object.n_ploidy; ++j){ - stream << '\t' << (int)gt_object[0].allele << (gt_object[0].phase ? '|' : '/') << (int)gt_object[1].allele; - } - } - return(stream); -} - -std::ostream& to_vcf_string(std::ostream& stream, const std::vector& gt_objects){ - if(gt_objects.size() == 0) - return(stream); - - stream << (int)gt_objects[0][0].allele << (gt_objects[0][0].phase ? '|' : '/') << (int)gt_objects[0][1].allele; - - for(U32 element = 1; element < gt_objects.size(); ++element){ - if(gt_objects[element].n_ploidy == 0) - continue; - - stream << '\t'; - - for(U32 object = 0; object < gt_objects[element].n_objects; ++object){ - stream << (int)gt_objects[element][0].allele << (gt_objects[element][0].phase ? '|' : '/') << (int)gt_objects[element][1].allele; - for(U32 k = 1; k < gt_objects[element].n_ploidy; ++k){ - stream << '\t' << (int)gt_objects[element][k].allele << (gt_objects[element][k].phase ? 
'|' : '/') << (int)gt_objects[element][k].allele; - } - } - } - return(stream); -} - std::ostream& to_vcf_string(std::ostream& stream, const std::string& string){ stream << string; return(stream); } -std::ostream& to_vcf_string(std::ostream& stream, const char& delimiter, const core::MetaEntry& meta_entry, const core::VariantHeader& header){ - stream.write(&header.getContig(meta_entry.contigID).name[0], header.getContig(meta_entry.contigID).name.size()) << '\t'; +std::ostream& to_vcf_string(std::ostream& stream, const char& delimiter, const core::MetaEntry& meta_entry, const VariantHeader& header){ + stream.write(&header.GetContig(meta_entry.contigID)->name[0], header.GetContig(meta_entry.contigID)->name.size()) << '\t'; stream << meta_entry.position + 1 << delimiter; if(meta_entry.name.size() == 0) stream.put('.'); @@ -788,8 +271,8 @@ std::ostream& to_vcf_string(std::ostream& stream, const char& delimiter, const c return(stream); } -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const char& delimiter, const core::MetaEntry& meta_entry, const core::VariantHeader& header){ - buffer += header.getContig(meta_entry.contigID).name; +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const char& delimiter, const core::MetaEntry& meta_entry, const VariantHeader& header){ + buffer += header.GetContig(meta_entry.contigID)->name; buffer += delimiter; buffer.AddReadble(meta_entry.position + 1); buffer += delimiter; @@ -819,27 +302,26 @@ io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const char& delimiter, c else { buffer.AddReadble(meta_entry.quality); } - buffer += delimiter; return(buffer); } -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const char& delimiter, const core::MetaEntry& meta_entry, const core::VariantHeader& header, const DataBlockSettings& controller){ - if(controller.contig.display){ - buffer += header.getContig(meta_entry.contigID).name; +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const char& delimiter, const 
core::MetaEntry& meta_entry, const VariantHeader& header, const DataBlockSettings& controller){ + //if(controller.contig.display){ + buffer += header.GetContig(meta_entry.contigID)->name; buffer += delimiter; - } + //} - if(controller.positions.display){ + //if(controller.positions.display){ buffer.AddReadble(meta_entry.position + 1); buffer += delimiter; - } + //} - if(controller.names.display){ + //if(controller.names.display){ if(meta_entry.name.size() == 0) buffer += '.'; else buffer += meta_entry.name; buffer += delimiter; - } + //} if(controller.display_ref){ if(meta_entry.n_alleles) buffer.Add(meta_entry.alleles[0].allele, meta_entry.alleles[0].l_allele); @@ -860,37 +342,37 @@ io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const char& delimiter, c buffer += delimiter; } - if(controller.quality.display){ + //if(controller.quality.display){ if(std::isnan(meta_entry.quality)) buffer += '.'; else buffer.AddReadble(meta_entry.quality); buffer += delimiter; - } + //} return(buffer); } -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const core::MetaEntry& meta_entry, const core::VariantHeader& header, const DataBlockSettings& controller){ +io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const core::MetaEntry& meta_entry, const VariantHeader& header, const DataBlockSettings& controller){ return(utility::to_json_string(buffer, meta_entry, header, controller)); } -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const char& delimiter, const core::MetaEntry& meta_entry, const core::VariantHeader& header, const DataBlockSettings& controller){ +io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const char& delimiter, const core::MetaEntry& meta_entry, const VariantHeader& header, const DataBlockSettings& controller){ bool add = false; - if(controller.contig.display){ + //if(controller.contig.display){ buffer += "\"contig\":\""; - buffer += header.getContig(meta_entry.contigID).name; + buffer += 
header.GetContig(meta_entry.contigID)->name; buffer += '"'; add = true; - } + //} - if(controller.positions.display){ + //if(controller.positions.display){ if(add){ buffer += ','; add = false; } buffer += "\"position\":"; buffer.AddReadble(meta_entry.position + 1); add = true; - } + //} - if(controller.names.display){ + //if(controller.names.display){ if(add){ buffer += ','; add = false; } buffer += "\"name\":"; if(meta_entry.name.size() == 0) buffer += "null"; @@ -900,9 +382,9 @@ io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const char& delimiter, buffer += '"'; add = true; } - } + //} - if(controller.display_ref){ + //if(controller.display_ref){ if(add){ buffer += ','; add = false; } buffer += "\"ref\":"; if(meta_entry.n_alleles){ @@ -913,7 +395,7 @@ io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const char& delimiter, buffer += "null"; } add = true; - } + //} if(controller.display_alt){ if(add){ buffer += ','; add = false; } @@ -936,16 +418,214 @@ io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const char& delimiter, add = true; } - if(controller.quality.display){ + //if(controller.quality.display){ if(add){ buffer += ','; add = false; } buffer += "\"quality\":"; if(std::isnan(meta_entry.quality)) buffer += 0; else buffer.AddReadble(meta_entry.quality); add = true; - } + //} return(buffer); } +int32_t* FormatDataHtslib(const BYTE* const src, int32_t* dst, const size_t n_entries){ + for(U32 i = 0; i < n_entries; ++i) dst[i] = src[i]; + return(dst); +} + +int32_t* FormatDataHtslib(const U16* const src, int32_t* dst, const size_t n_entries){ + for(U32 i = 0; i < n_entries; ++i) dst[i] = src[i]; + return(dst); +} + +int32_t* FormatDataHtslib(const U32* const src, int32_t* dst, const size_t n_entries){ + for(U32 i = 0; i < n_entries; ++i) dst[i] = src[i]; + return(dst); +} + +int32_t* FormatDataHtslib(const U64* const src, int32_t* dst, const size_t n_entries){ + for(U32 i = 0; i < n_entries; ++i) dst[i] = src[i]; + return(dst); +} + 
+int32_t* FormatDataHtslib(const SBYTE* const src, int32_t* dst, const size_t n_entries){ + for(U32 i = 0; i < n_entries; ++i){ + if(src[i] == INT8_MIN) { dst[i] = bcf_int32_missing; } + else if(src[i] == INT8_MIN+1){ dst[i] = bcf_int32_vector_end; } + else dst[i] = src[i]; + } + return(dst); +} + +int32_t* FormatDataHtslib(const S16* const src, int32_t* dst, const size_t n_entries){ + for(U32 i = 0; i < n_entries; ++i){ + if(src[i] == INT16_MIN) { dst[i] = bcf_int32_missing; } + else if(src[i] == INT16_MIN+1){ dst[i] = bcf_int32_vector_end; } + else dst[i] = src[i]; + } + return(dst); +} + +int32_t* FormatDataHtslib(const S32* const src, int32_t* dst, const size_t n_entries){ + for(U32 i = 0; i < n_entries; ++i){ + if(src[i] == INT32_MIN) { dst[i] = bcf_int32_missing; } + else if(src[i] == INT32_MIN+1){ dst[i] = bcf_int32_vector_end; } + else dst[i] = src[i]; + } + return(dst); +} + +int32_t* FormatDataHtslib(const S64* const src, int32_t* dst, const size_t n_entries){ + for(U32 i = 0; i < n_entries; ++i) dst[i] = src[i]; + return(dst); +} + +int32_t* FormatDataHtslib(const float* const src, int32_t* dst, const size_t n_entries){ + std::cerr << utility::timestamp("ERROR") << "Cannot convert float into integer." << std::endl; + return(dst); +} + +int32_t* FormatDataHtslib(const double* const src, int32_t* dst, const size_t n_entries){ + std::cerr << utility::timestamp("ERROR") << "Cannot convert double into integer." 
<< std::endl; + return(dst); +} + +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, + bcf_hdr_t* hdr, + const std::string& tag, + const BYTE* const data, + const size_t n_entries) +{ + int32_t* tmpi = new int32_t[n_entries]; + FormatDataHtslib(data, tmpi, n_entries); + bcf_update_info_int32(hdr, rec, tag.data(), &tmpi, n_entries); + delete [] tmpi; + return(rec); +} + +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, + bcf_hdr_t* hdr, + const std::string& tag, + const U16* const data, + const size_t n_entries) +{ + int32_t* tmpi = new int32_t[n_entries]; + FormatDataHtslib(data, tmpi, n_entries); + bcf_update_info_int32(hdr, rec, tag.data(), tmpi, n_entries); + delete [] tmpi; + return(rec); +} + +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, + bcf_hdr_t* hdr, + const std::string& tag, + const U32* const data, + const size_t n_entries) +{ + int32_t* tmpi = new int32_t[n_entries]; + FormatDataHtslib(data, tmpi, n_entries); + bcf_update_info_int32(hdr, rec, tag.data(), tmpi, n_entries); + delete [] tmpi; + return(rec); +} + +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, + bcf_hdr_t* hdr, + const std::string& tag, + const U64* const data, + const size_t n_entries) +{ + int32_t* tmpi = new int32_t[n_entries]; + FormatDataHtslib(data, tmpi, n_entries); + bcf_update_info_int32(hdr, rec, tag.data(), tmpi, n_entries); + delete [] tmpi; + return(rec); +} + +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, + bcf_hdr_t* hdr, + const std::string& tag, + const SBYTE* const data, + const size_t n_entries) +{ + int32_t* tmpi = new int32_t[n_entries]; + FormatDataHtslib(data, tmpi, n_entries); + bcf_update_info_int32(hdr, rec, tag.data(), tmpi, n_entries); + delete [] tmpi; + return(rec); +} + +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, + bcf_hdr_t* hdr, + const std::string& tag, + const S16* const data, + const size_t n_entries) +{ + int32_t* tmpi = new int32_t[n_entries]; + FormatDataHtslib(data, tmpi, n_entries); + bcf_update_info_int32(hdr, rec, tag.data(), tmpi, n_entries); + 
delete [] tmpi; + return(rec); +} + +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, + bcf_hdr_t* hdr, + const std::string& tag, + const S32* const data, + const size_t n_entries) +{ + bcf_update_info_int32(hdr, rec, tag.data(), data, n_entries); + return(rec); +} + +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, + bcf_hdr_t* hdr, + const std::string& tag, + const S64* const data, + const size_t n_entries) +{ + int32_t* tmpi = new int32_t[n_entries]; + FormatDataHtslib(data, tmpi, n_entries); + bcf_update_info_int32(hdr, rec, tag.data(), tmpi, n_entries); + delete [] tmpi; + return(rec); +} + +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, + bcf_hdr_t* hdr, + const std::string& tag, + const float* const data, + const size_t n_entries) +{ + float* tmpi = new float[n_entries]; + FormatDataHtslib(data, tmpi, n_entries); + bcf_update_info_float(hdr, rec, tag.data(), tmpi, n_entries); + delete [] tmpi; + return(rec); +} + +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, + bcf_hdr_t* hdr, + const std::string& tag, + const double* const data, + const size_t n_entries) +{ + float* tmpi = new float[n_entries]; + FormatDataHtslib(data, tmpi, n_entries); + bcf_update_info_float(hdr, rec, tag.data(), tmpi, n_entries); + delete [] tmpi; + return(rec); +} + +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, + bcf_hdr_t* hdr, + const std::string& tag, + const std::string& data) +{ + bcf_update_info_string(hdr, rec, tag.data(), data.data()); + return(rec); +} + } } diff --git a/lib/utility/support_vcf.h b/lib/utility/support_vcf.h index 5339382..7954ef6 100644 --- a/lib/utility/support_vcf.h +++ b/lib/utility/support_vcf.h @@ -5,8 +5,9 @@ #include #include #include "support/type_definitions.h" -#include "containers/primitive_container.h" -#include "core/genotype_object.h" +//#include "containers/primitive_container.h" +#include "core/data_block_settings.h" +#include "core/meta_entry.h" namespace tachyon{ namespace utility{ @@ -22,66 +23,54 @@ namespace utility{ #define YON_FLOAT_EOV 
0x7F800002 // Base functionality converting data to a valid VCF string -std::ostream& to_vcf_string(std::ostream& stream, const BYTE* const values, const U32 n_entries); -std::ostream& to_vcf_string(std::ostream& stream, const U16* const values, const U32 n_entries); -std::ostream& to_vcf_string(std::ostream& stream, const U32* const values, const U32 n_entries); -std::ostream& to_vcf_string(std::ostream& stream, const U64* const values, const U32 n_entries); -std::ostream& to_vcf_string(std::ostream& stream, const SBYTE* const values, const U32 n_entries); -std::ostream& to_vcf_string(std::ostream& stream, const S16* const values, const U32 n_entries); -std::ostream& to_vcf_string(std::ostream& stream, const S32* const values, const U32 n_entries); -std::ostream& to_vcf_string(std::ostream& stream, const char* const values, const U32 n_entries); -std::ostream& to_vcf_string(std::ostream& stream, const float* const values, const U32 n_entries); -std::ostream& to_vcf_string(std::ostream& stream, const double* const values, const U32 n_entries); -std::ostream& to_vcf_string(std::ostream& stream, const std::string& string); +io::BasicBuffer& to_vcf_string(io::BasicBuffer& stream, const BYTE* const data, const size_t n_data); +io::BasicBuffer& to_vcf_string(io::BasicBuffer& stream, const U16* const data, const size_t n_data); +io::BasicBuffer& to_vcf_string(io::BasicBuffer& stream, const U32* const data, const size_t n_data); +io::BasicBuffer& to_vcf_string(io::BasicBuffer& stream, const U64* const data, const size_t n_data); +io::BasicBuffer& to_vcf_string(io::BasicBuffer& stream, const SBYTE* const data, const size_t n_data); +io::BasicBuffer& to_vcf_string(io::BasicBuffer& stream, const S16* const data, const size_t n_data); +io::BasicBuffer& to_vcf_string(io::BasicBuffer& stream, const S32* const data, const size_t n_data); +io::BasicBuffer& to_vcf_string(io::BasicBuffer& stream, const char* const data, const size_t n_data); +io::BasicBuffer& 
to_vcf_string(io::BasicBuffer& stream, const float* const data, const size_t n_data); +io::BasicBuffer& to_vcf_string(io::BasicBuffer& stream, const double* const data, const size_t n_data); +io::BasicBuffer& to_vcf_string(io::BasicBuffer& stream, const std::string& string); -// Primitive container declarations -// Unsigned values does not have missing values -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container); -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container); -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container); -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container); -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container); -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container); -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container); -std::ostream& to_vcf_string_char(std::ostream& stream, const containers::PrimitiveContainer& container); -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container); -std::ostream& to_vcf_string(std::ostream& stream, const containers::PrimitiveContainer& container); +std::ostream& to_vcf_string(std::ostream& stream, const char& delimiter, const core::MetaEntry& meta_entry, const VariantHeader& header); +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const char& delimiter, const core::MetaEntry& meta_entry, const VariantHeader& header); +io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const char& delimiter, const core::MetaEntry& meta_entry, const VariantHeader& header, const DataBlockSettings& controller); +io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const core::MetaEntry& meta_entry, const VariantHeader& header, const DataBlockSettings& 
controller); +io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const char& delimiter, const core::MetaEntry& meta_entry, const VariantHeader& header, const DataBlockSettings& controller); -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_vcf_string_char(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag, const BYTE* const data, const size_t n_entries); +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag, const U16* const data, const size_t n_entries); +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag, const U32* const data, const size_t n_entries); +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag, const U64* const data, const size_t n_entries); +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag, const SBYTE* const data, const size_t n_entries); +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* 
rec, bcf_hdr_t* hdr, const std::string& tag, const S16* const data, const size_t n_entries); +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag, const S32* const data, const size_t n_entries); +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag, const float* const data, const size_t n_entries); +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag, const double* const data, const size_t n_entries); +bcf1_t* UpdateHtslibVcfRecordInfo(bcf1_t* rec, bcf_hdr_t* hdr, const std::string& tag, const std::string& data); -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_json_string_char(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const containers::PrimitiveContainer& container); +int32_t* FormatDataHtslib(const BYTE* const src, int32_t* dst, const size_t n_entries); +int32_t* FormatDataHtslib(const U16* const src, int32_t* dst, const size_t n_entries); +int32_t* FormatDataHtslib(const U32* const src, int32_t* dst, const size_t n_entries); +int32_t* 
FormatDataHtslib(const U64* const src, int32_t* dst, const size_t n_entries); +int32_t* FormatDataHtslib(const SBYTE* const src, int32_t* dst, const size_t n_entries); +int32_t* FormatDataHtslib(const S16* const src, int32_t* dst, const size_t n_entries); +int32_t* FormatDataHtslib(const S32* const src, int32_t* dst, const size_t n_entries); +int32_t* FormatDataHtslib(const S64* const src, int32_t* dst, const size_t n_entries); +int32_t* FormatDataHtslib(const float* const src, int32_t* dst, const size_t n_entries); +int32_t* FormatDataHtslib(const double* const src, int32_t* dst, const size_t n_entries); -// Genotype objects -std::ostream& to_vcf_string(std::ostream& stream, const core::GTObject& gt_object); -std::ostream& to_vcf_string(std::ostream& stream, const std::vector& gt_objects); +template +float* FormatDataHtslib(const T* const src, float* dst, const size_t n_entries){ + for(U32 i = 0; i < n_entries; ++i) dst[i] = src[i]; + return(dst); +} -std::ostream& to_vcf_string(std::ostream& stream, const char& delimiter, const core::MetaEntry& meta_entry, const core::VariantHeader& header); -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const char& delimiter, const core::MetaEntry& meta_entry, const core::VariantHeader& header); -io::BasicBuffer& to_vcf_string(io::BasicBuffer& buffer, const char& delimiter, const core::MetaEntry& meta_entry, const core::VariantHeader& header, const DataBlockSettings& controller); -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const core::MetaEntry& meta_entry, const core::VariantHeader& header, const DataBlockSettings& controller); -io::BasicBuffer& to_json_string(io::BasicBuffer& buffer, const char& delimiter, const core::MetaEntry& meta_entry, const core::VariantHeader& header, const DataBlockSettings& controller); } } - - #endif /* UTILITY_SUPPORT_VCF_H_ */ diff --git a/lib/variant_importer.cpp b/lib/variant_importer.cpp index 6ed78ae..6eaa613 100644 --- a/lib/variant_importer.cpp +++ 
b/lib/variant_importer.cpp @@ -1,22 +1,15 @@ -#include "variant_importer.h" - #include #include -#include "algorithm/digest/variant_digest_manager.h" -#include "algorithm/encryption/encryption_decorator.h" +#include "variant_importer.h" #include "containers/checksum_container.h" -#include "core/footer/footer.h" namespace tachyon { -#define IMPORT_ASSERT 1 - VariantImporter::VariantImporter(const settings_type& settings) : settings_(settings), GT_available_(false), - writer(nullptr), - header(nullptr) + writer(nullptr) { } @@ -25,465 +18,479 @@ VariantImporter::~VariantImporter(){ delete this->writer; } -bool VariantImporter::Build(){ - std::ifstream temp(this->settings_.input_file, std::ios::binary | std::ios::in); - if(!temp.good()){ - std::cerr << utility::timestamp("ERROR", "IMPORT") << "Failed to open file (" << this->settings_.input_file << ")..." << std::endl; - return false; - } - char tempData[2]; - temp.read(&tempData[0], 2); - temp.close(); +void VariantImporter::clear(){ + this->vcf_container_.clear(); +} - if((BYTE)tempData[0] == io::constants::GZIP_ID1 && (BYTE)tempData[1] == io::constants::GZIP_ID2){ - if(!this->BuildBCF()){ - std::cerr << utility::timestamp("ERROR", "IMPORT") << "Failed build!" << std::endl; - return false; - } - } else { - std::cerr << utility::timestamp("ERROR", "IMPORT") << "Unknown file format!" << std::endl; +bool VariantImporter::Build(){ + if(!this->BuildVCF()){ + std::cerr << utility::timestamp("ERROR", "IMPORT") << "Failed build!" << std::endl; return false; } return true; } -bool VariantImporter::BuildBCF(void){ - bcf_reader_type bcf_reader; - if(!bcf_reader.open(this->settings_.input_file)){ - std::cerr << utility::timestamp("ERROR", "BCF") << "Failed to open BCF file..." << std::endl; +bool VariantImporter::BuildVCF(void){ + // Retrieve a unique VcfReader. 
+ this->vcf_reader_ = io::VcfReader::FromFile(this->settings_.input_file); + if(this->vcf_reader_ == nullptr){ return false; } - encryption::EncryptionDecorator encryption_manager; - encryption::Keychain<> keychain; + for(U32 i = 0; i < this->vcf_reader_->vcf_header_.contigs_.size(); ++i){ + if(this->vcf_reader_->vcf_header_.contigs_[i].n_bases == 0){ + std::cerr << utility::timestamp("NOTICE") << "No length declared for contig. Setting to INT32_MAX." << std::endl; + this->vcf_reader_->vcf_header_.contigs_[i].n_bases = std::numeric_limits::max(); + } + } - this->header = &bcf_reader.header; + // Remap the global IDX fields in Vcf to the appropriate incremental order. + // This is useful in the situations when fields have been removed or added + // to the Vcf header section without reformatting the file. + for(U32 i = 0; i < this->vcf_reader_->vcf_header_.contigs_.size(); ++i) + this->contig_reorder_map_[this->vcf_reader_->vcf_header_.contigs_[i].idx] = i; - // Spawn RLE controller and update PPA controller - this->encoder.setSamples(this->header->samples); - this->block.ppa_manager.setSamples(this->header->samples); - this->permutator.manager = &this->block.ppa_manager; - this->permutator.setSamples(this->header->samples); + for(U32 i = 0; i < this->vcf_reader_->vcf_header_.info_fields_.size(); ++i) + this->info_reorder_map_[this->vcf_reader_->vcf_header_.info_fields_[i].idx] = i; - if(this->settings_.output_prefix.size() == 0) this->writer = new writer_stream_type; - else this->writer = new writer_file_type; + for(U32 i = 0; i < this->vcf_reader_->vcf_header_.format_fields_.size(); ++i) + this->format_reorder_map_[this->vcf_reader_->vcf_header_.format_fields_[i].idx] = i; - if(!this->writer->open(this->settings_.output_prefix)){ - std::cerr << utility::timestamp("ERROR", "WRITER") << "Failed to open writer..." 
<< std::endl; - return false; - } + for(U32 i = 0; i < this->vcf_reader_->vcf_header_.filter_fields_.size(); ++i) + this->filter_reorder_map_[this->vcf_reader_->vcf_header_.filter_fields_[i].idx] = i; - //index::VariantIndex idx; - for(U32 i = 0; i < this->header->contigs.size(); ++i){ - const U64 contig_length = this->header->contigs[i].bp_length; - BYTE n_levels = 7; - U64 bins_lowest = pow(4,n_levels); - double used = ( bins_lowest - (contig_length % bins_lowest) ) + contig_length; - - if(used / bins_lowest < 2500){ - for(S32 i = n_levels; i != 0; --i){ - if(used/pow(4,i) > 2500){ - n_levels = i; - break; - } - } - } + // Predicate of a search for "GT" FORMAT field in the Vcf header. + this->GT_available_ = (this->vcf_reader_->vcf_header_.GetFormat("GT") != nullptr); - this->writer->index.index_.add(i, contig_length, n_levels); - //std::cerr << "contig: " << this->header->contigs[i].name << "(" << i << ")" << " -> " << contig_length << " levels: " << (int)n_levels << std::endl; - //std::cerr << "idx size:" << idx.size() << " at " << this->writer->index.variant_index_[i].size() << std::endl; - //std::cerr << i << "->" << this->header->contigs[i].name << ":" << contig_length << " up to " << (U64)used << " width (bp) lowest level: " << used/pow(4,n_levels) << "@level: " << (int)n_levels << std::endl; - } + // Predicate of a search for "END" INFO field in the Vcf header. 
+ io::VcfInfo* vcf_info_end = this->vcf_reader_->vcf_header_.GetInfo("END"); + if(vcf_info_end != nullptr) + this->settings_.info_end_key = vcf_info_end->idx; - // Writer MAGIC - this->writer->stream->write(&tachyon::constants::FILE_HEADER[0], tachyon::constants::FILE_HEADER_LENGTH); - // Convert VCF header to Tachyon heeader - core::VariantHeader header(*this->header); - header.literals += "\n##tachyon_importVersion=" + tachyon::constants::PROGRAM_NAME + "-" + VERSION + ";"; - header.literals += "libraries=" + tachyon::constants::PROGRAM_NAME + '-' + tachyon::constants::TACHYON_LIB_VERSION + "," - + SSLeay_version(SSLEAY_VERSION) + "," + "ZSTD-" + ZSTD_versionString() + "; timestamp=" + utility::datetime(); - header.literals += "\n" + this->settings_.getInterpretedString(); - - if(this->settings_.encrypt_data) header.literals += " -k"; - if(this->settings_.permute_genotypes) header.literals += " -P"; - else header.literals += " -p"; - header.header_magic.l_literals = header.literals.size(); - - // Convert header to byte stream, compress, and write to file - containers::DataContainer header_data; - header_data.resize(65536 + header.literals.size()*2); - header_data.buffer_data_uncompressed << header; - this->compression_manager.zstd_codec.compress(header_data); - *this->writer->stream << header_data.header; // write header - *this->writer->stream << header_data.buffer_data; - - // Search for GT field in the header - this->GT_available_ = header.has_format_field("GT"); - for(U32 i = 0; i < this->header->format_map.size(); ++i){ - if(this->header->format_map[i].ID == "GT"){ - bcf_reader.map_gt_id = this->header->format_map[i].IDX; - } + // Allocate a new writer. 
+ if(this->settings_.output_prefix.size() == 0 || + (this->settings_.output_prefix.size() == 1 && this->settings_.output_prefix == "-")) + { + this->writer = new writer_stream_type; } + else this->writer = new writer_file_type; - // Search for END field in the header - for(U32 i = 0; i < this->header->info_map.size(); ++i){ - if(this->header->info_map[i].ID == "END"){ - this->settings_.info_end_key = this->header->info_map[i].IDX; - //std::cerr << "Found END at: " << this->header->info_map[i].IDX << std::endl; - } + // Open a file handle or standard out for writing. + if(!this->writer->open(this->settings_.output_prefix)){ + std::cerr << utility::timestamp("ERROR", "WRITER") << "Failed to open writer..." << std::endl; + return false; } - // Search for END field in the header - for(U32 i = 0; i < this->header->info_map.size(); ++i){ - if(this->header->info_map[i].ID == "SVLEN"){ - this->settings_.info_svlen_key = this->header->info_map[i].IDX; - //std::cerr << "Found SVLEN at: " << this->header->info_map[i].IDX << std::endl; - } - } + // Setup the encryption container. + encryption::EncryptionDecorator encryption_manager; + encryption::Keychain<> keychain; - // Set flag if genotypes are available - this->block.header.controller.hasGT = this->GT_available_; + // Setup the checksums container. + algorithm::VariantDigestManager checksums(YON_BLK_N_STATIC + 1, // Add one for global checksum. + this->vcf_reader_->vcf_header_.info_fields_.size() + 1, + this->vcf_reader_->vcf_header_.format_fields_.size() + 1); + + // The index needs to know how many contigs that's described in the + // Vcf header and their lenghts. This information is needed to construct + // the linear and quad-tree index most appropriate for the data. + this->writer->index.Add(this->vcf_reader_->vcf_header_.contigs_); + + // Write out a fresh Tachyon header with the data from the Vcf header. As + // this data will not be modified during the import stage it is safe to + // write out now. 
+ this->writer->stream->write(&constants::FILE_HEADER[0], constants::FILE_HEADER_LENGTH); // Todo: fix + this->WriteYonHeader(); + + // Setup genotype permuter and genotype encoder. + this->permutator.SetSamples(this->vcf_reader_->vcf_header_.GetNumberSamples()); + this->encoder.SetSamples(this->vcf_reader_->vcf_header_.GetNumberSamples()); + + // Allocate containers and offsets for this file. + // This is not strictly necessary but prevents nasty resize + // calls in most cases. + this->block.Allocate(this->vcf_reader_->vcf_header_.info_fields_.size(), + this->vcf_reader_->vcf_header_.format_fields_.size(), + this->vcf_reader_->vcf_header_.filter_fields_.size()); // Resize containers const U32 resize_to = this->settings_.checkpoint_n_snps * sizeof(U32) * 2; // small initial allocation this->block.resize(resize_to); - // Digest controller - algorithm::VariantDigestManager checksums(25, this->header->info_map.size(), this->header->format_map.size()); - - // Start import - U32 previous_first = 0; - U32 previous_last = 0; - S32 previous_contig_ID = -1; - - // Begin import - // Get BCF entries + // Start porgress timer. algorithm::Timer timer; timer.Start(); - if(!SILENT){ - std::cerr << utility::timestamp("PROGRESS") << - std::setfill(' ') << std::setw(10) << "Variants" << ' ' << - std::setfill(' ') << std::setw(10) << "Written" << '\t' << - std::setfill(' ') << std::setw(8) << "Completion" << ' ' << - "Elapsed " << "Contig:from->to" << std::endl; - } - bcf_reader.setFilterInvariantSites(this->settings_.drop_invariant_sites); + // Iterate over all available variants in the file or until encountering + // an error. while(true){ - // Retrieve BCF records - if(!bcf_reader.getVariants(this->settings_.checkpoint_n_snps, this->settings_.checkpoint_bases)){ + // Retrieve bcf1_t records using htslib and lazy evaluate them. Stop + // after retrieving a set number of variants or if the interval between + // the smallest and largest variant exceeds some distance in base pairs. 
+ if(this->vcf_container_.GetVariants(this->settings_.checkpoint_n_snps, + this->settings_.checkpoint_bases, + this->vcf_reader_) == false) + { break; } - // Debug assertion -#if IMPORT_ASSERT == 1 - if(bcf_reader.front().body->CHROM == previous_contig_ID){ - if(!(bcf_reader.front().body->POS >= previous_first && bcf_reader.front().body->POS >= previous_last)){ - std::cerr << utility::timestamp("ERROR","IMPORT") << bcf_reader.front().body->POS << '/' << previous_first << '/' << previous_last << std::endl; - std::cerr << bcf_reader[bcf_reader.n_entries].body->POS << std::endl; - exit(1); - } - } -#endif - this->block.header.blockID = this->writer->n_blocks_written; - this->block.header.contigID = bcf_reader.front().body->CHROM; - this->block.header.minPosition = bcf_reader.front().body->POS; - this->block.header.maxPosition = bcf_reader.back().body->POS; // correct only for SNVs - this->block.header.controller.hasGT = this->GT_available_; - this->block.header.controller.hasGTPermuted = this->settings_.permute_genotypes; - // if there is 0 or 1 samples then GT data is never permuted - if(header.getSampleNumber() <= 1) - this->block.header.controller.hasGTPermuted = false; - - // Permute GT if GT is available and the appropriate flag is triggered - if(this->block.header.controller.hasGT && this->block.header.controller.hasGTPermuted){ - if(!this->permutator.build(bcf_reader)){ - std::cerr << utility::timestamp("ERROR","PERMUTE") << "Failed to complete..." << std::endl; - return false; - } - } + // This pointer here is borrowed from the PPA manager + // during import stages. Do not destroy the target block + // before finishing with this. 
+ this->block.gt_ppa = &this->permutator.permutation_array; - //\//////////////////////////////////////////////// - // Start new - // Perform parsing of BCF entries in memory - // Split out RLE compression and importing other INFO/FORMAT - meta_type* meta_entries = static_cast(::operator new[](bcf_reader.size() * sizeof(meta_type))); - - // Load meta data - for(U32 i = 0; i < bcf_reader.size(); ++i){ - new( meta_entries + i ) meta_type( bcf_reader[i], this->block.header.minPosition ); - if(!this->addSite(meta_entries[i], bcf_reader[i])){ - std::cerr << utility::timestamp("ERROR","IMPORT") << "Failed to add BCF entry..." << std::endl; - return false; + if(this->GT_available_ && this->settings_.permute_genotypes){ + // Only store the permutation array if the number of samples + // are greater then one (1). + if(this->vcf_reader_->vcf_header_.GetNumberSamples() > 1){ + if(this->permutator.Build(this->vcf_container_, this->vcf_reader_->vcf_header_) == false) + return false; + + this->block.header.controller.hasGTPermuted = true; } } - // Add genotypes in parallel - this->addGenotypes(bcf_reader, meta_entries); - // Overload - for(U32 i = 0; i < bcf_reader.size(); ++i) this->block += meta_entries[i]; - - // Clean up - for(std::size_t i = 0; i < bcf_reader.size(); ++i) (meta_entries + i)->~MetaEntry(); - ::operator delete[](static_cast(meta_entries)); - //\//////////////////////////////////////////////// - - // Update head meta - this->block.header.controller.hasGT = this->GT_available_; - this->block.header.n_variants = bcf_reader.size(); - this->block.finalize(); - - // Perform compression using standard parameters - if(!this->compression_manager.compress(this->block, this->settings_.compression_level, 6)){ + + if(this->AddRecords(this->vcf_container_) == false) return false; + + this->block.header.controller.hasGT = this->GT_available_; + this->block.header.n_variants = this->vcf_container_.sizeWithoutCarryOver(); + this->block.UpdateContainers(); + 
this->block.Finalize(); + + // Perform compression using standard parameters. + if(!this->compression_manager.Compress(this->block, this->settings_.compression_level, 6)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to compress..." << std::endl; return false; } - // Checksum have to come before encryption - checksums += this->block; - - // Encryption + // Encrypt the variant block if desired. if(this->settings_.encrypt_data){ + // Generate field-unique identifiers. + this->GenerateIdentifiers(); + + // Start encryption. this->block.header.controller.anyEncrypted = true; if(!encryption_manager.encrypt(this->block, keychain, YON_ENCRYPTION_AES_256_GCM)){ std::cerr << utility::timestamp("ERROR","COMPRESSION") << "Failed to encrypt..." << std::endl; } } - this->index_entry.byte_offset = this->writer->stream->tellp(); - this->block.write(*this->writer->stream, this->stats_basic, this->stats_info, this->stats_format); - - // Compress and write footer - this->block.footer_support.buffer_data_uncompressed << this->block.footer; - this->compression_manager.zstd_codec.compress(this->block.footer_support); - const U64 start_footer_pos = this->writer->stream->tellp(); - const U32 footer_uLength = this->block.footer_support.header.data_header.uLength; - const U32 footer_cLength = this->block.footer_support.header.data_header.cLength; - const U32 footer_crc = this->block.footer_support.header.data_header.crc; - this->writer->stream->write(reinterpret_cast(&footer_uLength), sizeof(U32)); - this->writer->stream->write(reinterpret_cast(&footer_cLength), sizeof(U32)); - this->writer->stream->write(reinterpret_cast(&footer_crc), sizeof(U32)); - *this->writer->stream << this->block.footer_support.buffer_data; - - stats_basic[0].cost_uncompressed += (U64)this->writer->stream->tellp() - start_footer_pos; - - // Write EOB - this->writer->stream->write(reinterpret_cast(&constants::TACHYON_BLOCK_EOF), sizeof(U64)); - - // Update index - this->index_entry.blockID = 
this->block.header.blockID; - this->index_entry.byte_offset_end = this->writer->stream->tellp(); - this->index_entry.contigID = bcf_reader.front().body->CHROM; - this->index_entry.minPosition = bcf_reader.front().body->POS; - //this->index_entry.maxPosition = bcf_reader.back().body->POS; - this->index_entry.n_variants = bcf_reader.size(); - this->writer->index.index_.linear_at(index_entry.contigID) += this->index_entry; // Todo: beautify - this->index_entry.reset(); + // Write the current variant block. + this->WriteBlock(); - ++this->writer->n_blocks_written; - this->writer->n_variants_written += bcf_reader.size(); - ++this->writer->index.number_blocks; + // Update checksums container with the available data. + checksums += this->block; if(!SILENT){ std::cerr << utility::timestamp("PROGRESS") << std::setfill(' ') << std::setw(10) << this->writer->n_variants_written << ' ' << std::setfill(' ') << std::setw(10) << utility::toPrettyDiskString(this->writer->stream->tellp()) << '\t' << - std::setfill(' ') << std::setw(8) << (double)bcf_reader.stream.tellg()/bcf_reader.filesize*100 << "%" << ' ' << timer.ElapsedString() << ' ' << - header.contigs[bcf_reader.front().body->CHROM].name << ":" << bcf_reader.front().body->POS+1 << "->" << bcf_reader.back().body->POS+1 << std::endl; + this->vcf_reader_->vcf_header_.GetContig(this->vcf_container_.front()->rid)->name << ":" << this->vcf_container_.front()->pos + 1 << "->" << this->vcf_container_.back()->pos + 1 << std::endl; } - // Reset and update + // Clear current data. + this->clear(); this->block.clear(); - this->permutator.reset(); - this->writer->stream->flush(); - previous_contig_ID = bcf_reader.front().body->CHROM; - previous_first = bcf_reader.front().body->POS; - previous_last = bcf_reader.back().body->POS; this->index_entry.reset(); } - // Done importing - this->writer->stream->flush(); + // Do not delete the borrowed pointer. 
+ this->block.gt_ppa = nullptr; - core::Footer footer; - footer.offset_end_of_data = this->writer->stream->tellp(); - footer.n_blocks = this->writer->n_blocks_written; - footer.n_variants = this->writer->n_variants_written; - assert(footer.n_blocks == this->writer->index.size()); - - U64 last_pos = this->writer->stream->tellp(); - this->writer->writeIndex(); // Write index - std::cerr << utility::timestamp("PROGRESS") << "Index size: " << utility::toPrettyDiskString((U64)this->writer->stream->tellp() - last_pos) << "..." << std::endl; - last_pos = this->writer->stream->tellp(); - checksums.finalize(); // Finalize SHA-512 digests - *this->writer->stream << checksums; - std::cerr << utility::timestamp("PROGRESS") << "Checksum size: " << utility::toPrettyDiskString((U64)this->writer->stream->tellp() - last_pos) << "..." << std::endl; - last_pos = this->writer->stream->tellp(); - *this->writer->stream << footer; - std::cerr << utility::timestamp("PROGRESS") << "Footer size: " << utility::toPrettyDiskString((U64)this->writer->stream->tellp() - last_pos) << "..." << std::endl; + // Finalize writing procedure. 
+ this->WriteFinal(checksums); + this->WriteKeychain(keychain); - this->writer->stream->flush(); + // All done + return(true); +} - std::vector usage_statistics_names = { - "FooterHeader","GT-PPA","MetaContig","MetaPositions","MetaRefAlt","MetaController","MetaQuality","MetaNames", - "MetaAlleles","MetaInfoMaps","MetaFormatMaps","MetaFilterMaps","GT-Support", - "GT-RLE8","GT-RLE16","GT-RLE32","GT-RLE64", - "GT-Simple8","GT-Simple16","GT-Simple32","GT-Simple64","INFO-ALL","FORMAT-ALL"}; +bool VariantImporter::AddRecords(const vcf_container_type& container){ + meta_type* meta_entries = static_cast(::operator new[](container.sizeWithoutCarryOver() * sizeof(meta_type))); + for(U32 i = 0; i < container.sizeWithoutCarryOver(); ++i){ + // Transmute a bcf record into a meta structure + new( meta_entries + i ) meta_type( this->vcf_container_[i], this->block.header.minPosition ); - U64 total_uncompressed = 0; U64 total_compressed = 0; - for(U32 i = 0; i < usage_statistics_names.size(); ++i){ - total_uncompressed += this->stats_basic[i].cost_uncompressed; - total_compressed += this->stats_basic[i].cost_compressed; + // Add the record data + if(this->AddRecord(container, i, meta_entries[i]) == false) + return false; } - // If we are writing to a file - if(this->settings_.output_prefix.size()){ - std::ofstream writer_stats; - writer_file_type* wstats = reinterpret_cast(this->writer); - writer_stats.open(wstats->basePath + wstats->baseName + "_yon_stats.txt", std::ios::out); + // Add FORMAT:GT field data. 
+ this->encoder.Encode(container, meta_entries, this->block, this->permutator.permutation_array); - if(!SILENT) - std::cerr << utility::timestamp("LOG") << "Writing statistics to: " << (wstats->basePath + wstats->baseName) << "_yon_stats.txt" << std::endl; + // Add meta records to the block buffers + for(U32 i = 0; i < container.sizeWithoutCarryOver(); ++i) this->block += meta_entries[i]; - if(writer_stats.good()){ - for(U32 i = 0; i < usage_statistics_names.size(); ++i) writer_stats << usage_statistics_names[i] << '\t' << this->stats_basic[i] << std::endl; - for(U32 i = 0; i < header.header_magic.n_info_values; ++i) writer_stats << "INFO_" << header.info_fields[i].ID << '\t' << this->stats_info[i] << std::endl; - for(U32 i = 0; i < header.header_magic.n_format_values; ++i) writer_stats << "FORMAT_" << header.format_fields[i].ID << '\t' << this->stats_format[i] << std::endl; + for(std::size_t i = 0; i < container.sizeWithoutCarryOver(); ++i) (meta_entries + i)->~MetaEntry(); + ::operator delete[](static_cast(meta_entries)); - writer_stats << "BCF\t" << bcf_reader.filesize << "\t" << bcf_reader.b_data_read << '\t' << (float)bcf_reader.b_data_read/bcf_reader.filesize << std::endl; - writer_stats << "YON\t" << this->writer->stream->tellp() << "\t" << total_uncompressed << '\t' << (float)bcf_reader.b_data_read/this->writer->stream->tellp() << std::endl; - writer_stats.close(); - } else { - std::cerr << utility::timestamp("ERROR", "SUPPORT") << "Failed to open: " << (wstats->basePath + wstats->baseName + "_yon_stats.txt") << "... Continuing..." 
<< std::endl; - } + return true; +} + +bool VariantImporter::AddRecord(const vcf_container_type& container, const U32 position, meta_type& meta){ + if(container.at(position)->pos > this->vcf_reader_->vcf_header_.GetContig(container.at(position)->rid)->n_bases){ + std::cerr << utility::timestamp("ERROR", "IMPORT") << this->vcf_reader_->vcf_header_.GetContig(container.at(position)->rid)->name << ':' << container.at(position)->pos + 1 << " > reported max size of contig (" << this->vcf_reader_->vcf_header_.GetContig(container.at(position)->rid)->n_bases + 1 << ")..." << std::endl; + return false; } - const algorithm::GenotypeEncoderStatistics& gt_stats = this->encoder.getUsageStats(); - const U64 n_total_gt = gt_stats.getTotal(); - if(!SILENT){ - std::cout << "GT-RLE-8\t" << gt_stats.rle_counts[0] << '\t' << (float)gt_stats.rle_counts[0]/n_total_gt << std::endl; - std::cout << "GT-RLE-16\t" << gt_stats.rle_counts[1] << '\t' << (float)gt_stats.rle_counts[1]/n_total_gt << std::endl; - std::cout << "GT-RLE-32\t" << gt_stats.rle_counts[2] << '\t' << (float)gt_stats.rle_counts[2]/n_total_gt << std::endl; - std::cout << "GT-RLE-64\t" << gt_stats.rle_counts[3] << '\t' << (float)gt_stats.rle_counts[3]/n_total_gt << std::endl; - std::cout << "GT-RLES-8\t" << gt_stats.rle_simple_counts[0] << '\t' << (float)gt_stats.rle_simple_counts[0]/n_total_gt << std::endl; - std::cout << "GT-RLES-16\t" << gt_stats.rle_simple_counts[1] << '\t' << (float)gt_stats.rle_simple_counts[1]/n_total_gt << std::endl; - std::cout << "GT-RLES-32\t" << gt_stats.rle_simple_counts[2] << '\t' << (float)gt_stats.rle_simple_counts[2]/n_total_gt << std::endl; - std::cout << "GT-RLES-64\t" << gt_stats.rle_simple_counts[3] << '\t' << (float)gt_stats.rle_simple_counts[3]/n_total_gt << std::endl; - std::cout << "GT-DIPLOID-BCF-8\t" << gt_stats.diploid_bcf_counts[0] << '\t' << (float)gt_stats.diploid_bcf_counts[0]/n_total_gt << std::endl; - std::cout << "GT-DIPLOID-BCF-16\t" << gt_stats.diploid_bcf_counts[1] << '\t' 
<< (float)gt_stats.diploid_bcf_counts[1]/n_total_gt << std::endl; - std::cout << "GT-DIPLOID-BCF-32\t" << gt_stats.diploid_bcf_counts[2] << '\t' << (float)gt_stats.diploid_bcf_counts[2]/n_total_gt << std::endl; - std::cout << "GT-BCF-8\t" << gt_stats.bcf_counts[0] << '\t' << (float)gt_stats.bcf_counts[0]/n_total_gt << std::endl; - std::cout << "GT-BCF-16\t" << gt_stats.bcf_counts[1] << '\t' << (float)gt_stats.bcf_counts[1]/n_total_gt << std::endl; - std::cout << "GT-BCF-32\t" << gt_stats.bcf_counts[2] << '\t' << (float)gt_stats.bcf_counts[2]/n_total_gt << std::endl; - std::cerr << utility::timestamp("PROGRESS") << "Wrote: " << utility::ToPrettyString(this->writer->n_variants_written) << " variants in " << utility::ToPrettyString(this->writer->n_blocks_written) << " blocks in " << timer.ElapsedString() << " to " << utility::toPrettyDiskString((U64)this->writer->stream->tellp()) << std::endl; + if(this->AddVcfFilterInfo(container.at(position), meta) == false) return false; + if(this->AddVcfInfo(container.at(position), meta) == false) return false; + + if(container.at(position)->n_fmt){ + if(this->AddVcfFormatInfo(container.at(position), meta) == false) return false; + + // Perform these actions if FORMAT:GT data is available. 
+ const int& hts_format_key = container.at(position)->d.fmt[0].id; // htslib IDX value + if(this->vcf_reader_->vcf_header_.GetFormat(hts_format_key)->id != "GT"){ + meta.controller.gt_available = false; + } else + meta.controller.gt_available = true; } - // Write keychain - if(this->settings_.encrypt_data){ - if(this->settings_.output_prefix.size()){ - std::ofstream writer_keychain; - writer_file_type* wstats = reinterpret_cast(this->writer); - writer_keychain.open(wstats->basePath + wstats->baseName + ".kyon", std::ios::out); - if(!SILENT) - std::cerr << utility::timestamp("LOG") << "Writing encryption keychain to: " << (wstats->basePath + wstats->baseName) << ".kyon" << std::endl; + if(this->IndexRecord(container.at(position), meta) == false) return false; + return true; +} - if(writer_keychain.good()){ - //writer_keychain.write(keybuffer.data(), keybuffer.size()); - writer_keychain << keychain; - writer_keychain.flush(); - } - const U32 keychain_size = writer_keychain.tellp(); - writer_keychain.close(); - if(!SILENT) - std::cerr << utility::timestamp("LOG") << "Wrote keychain with " << utility::ToPrettyString(keychain.size()) << " keys to " << utility::toPrettyDiskString(keychain_size) << "..." << std::endl; - } +bool VariantImporter::AddVcfFilterInfo(const bcf1_t* record, meta_type& meta){ + // Add FILTER id list to the block. Filter information is unique in that the + // data is not stored as (key,value)-tuples but as a key id. Because no data + // is stored in the block, only the unique vectors of ids and their unique + // ids are collected here. These keys are used to construct bit-vectors for + // set-membership tests. 
+ std::vector filter_ids; + const int& n_filter_fields = record->d.n_flt; + for(U32 i = 0; i < n_filter_fields; ++i){ + const int& hts_filter_key = record->d.flt[i]; // htslib IDX value + const U32 global_key = this->filter_reorder_map_[hts_filter_key]; // tachyon global IDX value + const U32 target_container = this->block.AddFilter(global_key); + assert(target_container < 65536); + filter_ids.push_back(global_key); } - // temp - // bins in contig - /* - for(U32 i = 0; i < this->writer->index.variant_index_.at(19).size(); ++i){ - std::cout << i << '\t' << this->writer->index.variant_index_.at(19).at(i).blockID << '\t' << this->writer->index.variant_index_.at(19).at(i).n_variants_ << '\t'; - // hits in bin - for(U32 j = 0; j < this->writer->index.variant_index_.at(19).at(i).size(); ++j) - std::cout << ',' << this->writer->index.variant_index_.at(19).at(i).at(j); - std::cout << std::endl; + return(this->AddVcfFilterPattern(filter_ids, meta)); +} + +bool VariantImporter::AddVcfInfo(const bcf1_t* record, meta_type& meta){ + // Add INFO fields to the block + std::vector info_ids; + const int n_info_fields = record->n_info; + for(U32 i = 0; i < n_info_fields; ++i){ + const int& hts_info_key = record->d.info[i].key; // htslib IDX value + const U32 global_key = this->info_reorder_map_[hts_info_key]; // tachyon global IDX value + const U32 target_container = this->block.AddInfo(global_key); + assert(target_container < 65536); + info_ids.push_back(global_key); + + stream_container& destination_container = this->block.info_containers[target_container]; + const int& info_primitive_type = record->d.info[i].type; + const int& stride_size = record->d.info[i].len; + const uint32_t& data_length = record->d.info[i].vptr_len; + const uint8_t* data = record->d.info[i].vptr; + int element_stride_size = 0; + + if(info_primitive_type == BCF_BT_INT8){ + element_stride_size = sizeof(int8_t); + assert(data_length % element_stride_size == 0); + const SBYTE* data_local = 
reinterpret_cast(data); + for(U32 j = 0; j < data_length/element_stride_size; ++j) + destination_container.Add(data_local[j]); + } else if(info_primitive_type == BCF_BT_INT16){ + element_stride_size = sizeof(int16_t); + assert(data_length % element_stride_size == 0); + const S16* data_local = reinterpret_cast(data); + for(U32 j = 0; j < data_length/element_stride_size; ++j) + destination_container.Add(data_local[j]); + } else if(info_primitive_type == BCF_BT_INT32){ + element_stride_size = sizeof(int32_t); + assert(data_length % element_stride_size == 0); + const S32* data_local = reinterpret_cast(data); + for(U32 j = 0; j < data_length/element_stride_size; ++j) + destination_container.Add(data_local[j]); + } else if(info_primitive_type == BCF_BT_FLOAT){ + element_stride_size = sizeof(float); + assert(data_length % element_stride_size == 0); + const float* data_local = reinterpret_cast(data); + for(U32 j = 0; j < data_length/element_stride_size; ++j) + destination_container.Add(data_local[j]); + } else if(info_primitive_type == BCF_BT_CHAR){ + element_stride_size = sizeof(char); + const char* data_local = reinterpret_cast(data); + destination_container.AddCharacter(data_local, data_length); + } else if(info_primitive_type == BCF_BT_NULL){ + element_stride_size = 0; + assert(data_length == 0 && stride_size == 0); + } else { + std::cerr << utility::timestamp("ERROR","VCF") << "Unknown case: " << (int)info_primitive_type << std::endl; + exit(1); + } + + ++destination_container; + destination_container.AddStride(stride_size); } - */ - // All done - return(true); + return(this->AddVcfInfoPattern(info_ids, meta)); } -bool VariantImporter::addGenotypes(bcf_reader_type& bcf_reader, meta_type* meta_entries){ - /* - for(U32 i = 0; i < bcf_reader.size(); ++i){ - if(bcf_reader[i].hasGenotypes){ - meta_entries[i].controller.gt_available = true; +bool VariantImporter::AddVcfFormatInfo(const bcf1_t* record, meta_type& meta){ + // Add FORMAT fields to the block + std::vector 
format_ids; + const int n_format_fields = record->n_fmt; + for(U32 i = 0; i < n_format_fields; ++i){ + const int& hts_format_key = record->d.fmt[i].id;; // htslib IDX value + const U32 global_key = this->format_reorder_map_[hts_format_key]; // tachyon global IDX value + const U32 target_container = this->block.AddFormat(global_key); + assert(target_container < 65536); + format_ids.push_back(global_key); + + // Genotypes are a special case and are treated completely differently. + // Because of this we simply skip that data here if it is available. + if(this->vcf_reader_->vcf_header_.GetFormat(hts_format_key)->id == "GT"){ + continue; + } - if(!this->encoder.Encode(bcf_reader[i], meta_entries[i], this->block, this->permutator.manager->get())){ - std::cerr << utility::timestamp("ERROR","ENCODER") << "Failed to encode GT..." << std::endl; - return false; - } + stream_container& destination_container = this->block.format_containers[target_container]; + + const int& format_primitive_type = record->d.fmt[i].type; + const int& stride_size = record->d.fmt[i].n; + const uint32_t& data_length = record->d.fmt[i].p_len; + const uint8_t* data = record->d.fmt[i].p; + int element_stride_size = 0; + + if(format_primitive_type == BCF_BT_INT8){ + element_stride_size = sizeof(int8_t); + assert(data_length % element_stride_size == 0); + const SBYTE* data_local = reinterpret_cast(data); + for(U32 j = 0; j < data_length/element_stride_size; ++j) + destination_container.Add(data_local[j]); + assert(stride_size * element_stride_size * this->vcf_reader_->vcf_header_.GetNumberSamples() == data_length); + } else if(format_primitive_type == BCF_BT_INT16){ + element_stride_size = sizeof(int16_t); + assert(data_length % element_stride_size == 0); + const S16* data_local = reinterpret_cast(data); + for(U32 j = 0; j < data_length/element_stride_size; ++j) + destination_container.Add(data_local[j]); + assert(stride_size * element_stride_size * this->vcf_reader_->vcf_header_.GetNumberSamples() == 
data_length); + } else if(format_primitive_type == BCF_BT_INT32){ + element_stride_size = sizeof(int32_t); + assert(data_length % element_stride_size == 0); + const S32* data_local = reinterpret_cast(data); + for(U32 j = 0; j < data_length/element_stride_size; ++j) + destination_container.Add(data_local[j]); + assert(stride_size * element_stride_size * this->vcf_reader_->vcf_header_.GetNumberSamples() == data_length); + } else if(format_primitive_type == BCF_BT_FLOAT){ + element_stride_size = sizeof(float); + assert(data_length % element_stride_size == 0); + const float* data_local = reinterpret_cast(data); + for(U32 j = 0; j < data_length/element_stride_size; ++j) + destination_container.Add(data_local[j]); + assert(stride_size * element_stride_size * this->vcf_reader_->vcf_header_.GetNumberSamples() == data_length); + } else if(format_primitive_type == BCF_BT_CHAR){ + element_stride_size = sizeof(char); + const char* data_local = reinterpret_cast(data); + destination_container.AddCharacter(data_local, data_length); } else { - meta_entries[i].controller.gt_available = false; + std::cerr << utility::timestamp("ERROR","VCF") << "Unknown case: " << (int)format_primitive_type << std::endl; + exit(1); } + + ++destination_container; + destination_container.AddStride(stride_size); } - */ - this->encoder.EncodeParallel(bcf_reader, meta_entries, this->block, this->permutator.manager->get(), this->settings_.n_threads); - return true; + return(this->AddVcfFormatPattern(format_ids, meta)); } -bool VariantImporter::addSite(meta_type& meta, bcf_entry_type& entry){ - // Assert position is in range - if(entry.body->POS + 1 > this->header->getContig(entry.body->CHROM).bp_length){ - std::cerr << utility::timestamp("ERROR", "IMPORT") << this->header->getContig(entry.body->CHROM).name << ':' << entry.body->POS+1 << " > reported max size of contig (" << this->header->getContig(entry.body->CHROM).bp_length << ")..." 
<< std::endl; - return false; +bool VariantImporter::AddVcfInfoPattern(const std::vector& pattern, meta_type& meta){ + if(pattern.size()){ + meta.info_pattern_id = this->block.AddInfoPattern(pattern); } + return true; +} - if(!this->parseBCFBody(meta, entry)){ - std::cerr << utility::timestamp("ERROR","ENCODER") << "Failed to encode BCF body..." << std::endl; - return false; +bool VariantImporter::AddVcfFormatPattern(const std::vector& pattern, meta_type& meta){ + if(pattern.size()){ + meta.format_pattern_id = this->block.AddFormatPattern(pattern); } + return true; +} - // Add meta - //this->block += meta; +bool VariantImporter::AddVcfFilterPattern(const std::vector& pattern, meta_type& meta){ + if(pattern.size()){ + meta.filter_pattern_id = this->block.AddFilterPattern(pattern); + } + return true; +} - // Has END position? +bool VariantImporter::IndexRecord(const bcf1_t* record, const meta_type& meta){ S32 index_bin = -1; - S64 end_position_used = entry.body->POS; + // Ascertain that the meta entry has been evaluated + // prior to executing this function. + if(meta.n_alleles == 0){ + std::cerr << utility::timestamp("ERROR","IMPORT") << "The target meta record must be parsed prior to executing indexing functions..." << std::endl; + return false; + } + + S64 end_position_used = record->pos; + + // The Info field END is used as the end position of an interval if it is available. This field + // is usually only set for non-standard variants such as SVs or other special meaning records. if(this->settings_.info_end_key != -1){ - // Linear search: this is not optimal but probably still faster - // than generating a new hash table for each record - for(U32 i = 0; i < entry.infoPointer; ++i){ - if(entry.infoID[i].mapID == this->settings_.info_end_key){ - const U32 end = entry.getInteger(entry.infoID[i].primitive_type, entry.infoID[i].l_offset); - end_position_used = end; - //std::cerr << "Found END at " << i << ". 
END=" << end << " POS=" << entry.body->POS+1 << " BIN=" << reg2bin(entry.body->POS, end) << std::endl; - index_bin = this->writer->index.index_[meta.contigID].Add(entry.body->POS, end, (U32)this->writer->index.current_block_number()); + // Linear search for the END key: this is not optimal but is probably faster + // than first constructing a hash table for each record. + const int n_info_fields = record->n_info; + for(U32 i = 0; i < n_info_fields; ++i){ + if(record->d.info[i].key == this->settings_.info_end_key){ + U32 end = 0; + switch(record->d.info[i].type){ + case(BCF_BT_INT8): end = *reinterpret_cast (record->d.info[i].vptr); break; + case(BCF_BT_INT16): end = *reinterpret_cast(record->d.info[i].vptr); break; + case(BCF_BT_INT32): end = *reinterpret_cast(record->d.info[i].vptr); break; + default: + std::cerr << "Illegal END primitive type: " << io::BCF_TYPE_LOOKUP[record->d.info[i].type] << std::endl; + return false; + } + //std::cerr << "Found END at " << i << ". END=" << end << " POS=" << record->pos + 1 << std::endl; + index_bin = this->writer->index.index_[meta.contigID].add(record->pos, end, (U32)this->writer->index.current_block_number()); + //index_bin = 0; break; } } } + // If the END field cannot be found then we check if the variant is a simple SNV/indel and index it by its longest allele. if(index_bin == -1){ S32 longest = -1; + // Iterate over available allele information and find the longest + // SNV/indel length. The regex pattern ^[ATGC]{1,}$ searches for + // simple SNV/indels. 
for(U32 i = 0; i < meta.n_alleles; ++i){ - // Regex pattern ^[ATGC]{1,}$ - std::regex txt_regex("^[ATGC]{1,}$"); - if(std::regex_match(meta.alleles[i].toString(), txt_regex)){ - if(meta.alleles[i].l_allele > longest) longest = meta.alleles[i].l_allele; - } else { - //std::cerr << "POS=" << entry.body->POS+1 << " no regex match: " << meta.alleles[i].toString() << std::endl; + if(std::regex_match(meta.alleles[i].allele, utility::YON_VARIANT_STANDARD)){ + if(meta.alleles[i].l_allele > longest) + longest = meta.alleles[i].l_allele; } } + // Update the variant index with the target bin(s) found. if(longest > 1){ - index_bin = this->writer->index.index_[meta.contigID].Add(entry.body->POS, entry.body->POS + longest, (U32)this->writer->index.current_block_number()); - end_position_used = entry.body->POS + longest; - } else { // fallback if all others fail - index_bin = this->writer->index.index_[meta.contigID].Add(entry.body->POS, entry.body->POS, (U32)this->writer->index.current_block_number()); + index_bin = this->writer->index.index_[meta.contigID].add(record->pos, + record->pos + longest, + (U32)this->writer->index.current_block_number()); + //index_bin = 0; + end_position_used = record->pos + longest; } + // In the cases of special-meaning alleles such as copy-number (e.g. ) + // or SV (e.g. A[B)) they are indexed according to their left-most value only. + // This has the implication that they cannot be found by means of interval + // intersection searches. If special-meaning variants were to be supported + // in the index then many more blocks would have to be searched for each + // query as the few will dominate the many. 
+ else { + index_bin = this->writer->index.index_[meta.contigID].add(record->pos, record->pos, (U32)this->writer->index.current_block_number()); + //index_bin = 0; + //std::cerr << "fallback: " << record->pos+1 << std::endl; + } + + //std::cerr << "End pos used: " << end_position_used << std::endl; } if(index_bin > this->index_entry.maxBin) this->index_entry.maxBin = index_bin; if(index_bin < this->index_entry.minBin) this->index_entry.minBin = index_bin; @@ -496,147 +503,113 @@ bool VariantImporter::addSite(meta_type& meta, bcf_entry_type& entry){ return true; } -bool VariantImporter::parseBCFBody(meta_type& meta, bcf_entry_type& entry){ - for(U32 i = 0; i < entry.filterPointer; ++i){ - assert(entry.filterID[i].mapID != -1); - this->block.addFieldFILTER(this->header->filter_remap[entry.filterID[i].mapID]); - } - - for(U32 i = 0; i < entry.infoPointer; ++i){ - assert(entry.infoID[i].mapID != -1); - const U32 mapID = this->block.addFieldINFO(this->header->info_remap[entry.infoID[i].mapID]); +bool VariantImporter::UpdateIndex(){ + this->index_entry.blockID = this->writer->n_blocks_written; + this->index_entry.byte_offset_end = this->writer->stream->tellp(); + this->index_entry.contigID = this->vcf_container_.front()->rid; + this->index_entry.minPosition = this->vcf_container_.front()->pos; + this->index_entry.n_variants = this->vcf_container_.sizeWithoutCarryOver(); + this->writer->index.index_.linear_at(index_entry.contigID) += this->index_entry; + this->index_entry.reset(); + ++this->writer->n_blocks_written; + this->writer->n_variants_written += this->vcf_container_.sizeWithoutCarryOver(); + ++this->writer->index.number_blocks; - stream_container& target_container = this->block.info_containers[mapID]; + return true; +} - // Flags and integers - // These are BCF value types - U32 internal_pos = entry.infoID[i].l_offset; - if(entry.infoID[i].primitive_type <= 3){ - for(U32 j = 0; j < entry.infoID[i].l_stride; ++j){ - 
target_container.Add(entry.getInteger(entry.infoID[i].primitive_type, internal_pos)); - } - } - // Floats - else if(entry.infoID[i].primitive_type == bcf::BCF_FLOAT){ - for(U32 j = 0; j < entry.infoID[i].l_stride; ++j){ - target_container.Add(entry.getFloat(internal_pos)); - } - } - // Chars - else if(entry.infoID[i].primitive_type == bcf::BCF_CHAR){ - target_container.AddCharacter(entry.getCharPointer(internal_pos), entry.infoID[i].l_stride); - internal_pos += entry.infoID[i].l_stride; - } - // Illegal: parsing error - else { - std::cerr << "impossible in info: " << (int)entry.infoID[i].primitive_type << std::endl; - exit(1); - } +bool VariantImporter::WriteBlock(){ + this->index_entry.byte_offset = this->writer->stream->tellp(); + this->block.write(*this->writer->stream, this->stats_basic, this->stats_info, this->stats_format); + + // After all compression and writing is finished the header + // offsets are themselves compressed and stored in the block. + this->block.PackFooter(); // Pack footer into buffer. + this->compression_manager.zstd_codec.Compress(block.footer_support); + this->writer->WriteBlockFooter(this->block.footer_support); + this->writer->WriteEndOfBlock(); // End-of-block marker + this->UpdateIndex(); // Update index. 
+ return(this->writer->stream->good()); +} - ++target_container; - target_container.addStride(entry.infoID[i].l_stride); - } +bool VariantImporter::WriteFinal(algorithm::VariantDigestManager& checksums){ + // Done importing + this->writer->stream->flush(); - for(U32 i = 0; i < entry.formatPointer; ++i){ - assert(entry.formatID[i].mapID != -1); + core::Footer footer; + footer.offset_end_of_data = this->writer->stream->tellp(); + footer.n_blocks = this->writer->n_blocks_written; + footer.n_variants = this->writer->n_variants_written; + assert(footer.n_blocks == this->writer->index.GetLinearSize()); - //const U32 mapID = this->block.format_fields.setGet(this->header->format_remap[entry.formatID[i].mapID]); - const U32 mapID = this->block.addFieldFORMAT(this->header->format_remap[entry.formatID[i].mapID]); - U32 internal_pos = entry.formatID[i].l_offset; + U64 last_pos = this->writer->stream->tellp(); + this->writer->writeIndex(); // Write index + std::cerr << utility::timestamp("PROGRESS") << "Index size: " << utility::toPrettyDiskString((U64)this->writer->stream->tellp() - last_pos) << "..." << std::endl; + last_pos = this->writer->stream->tellp(); + checksums.finalize(); // Finalize SHA-512 digests + *this->writer->stream << checksums; + std::cerr << utility::timestamp("PROGRESS") << "Checksum size: " << utility::toPrettyDiskString((U64)this->writer->stream->tellp() - last_pos) << "..." << std::endl; + last_pos = this->writer->stream->tellp(); + *this->writer->stream << footer; + std::cerr << utility::timestamp("PROGRESS") << "Footer size: " << utility::toPrettyDiskString((U64)this->writer->stream->tellp() - last_pos) << "..." 
<< std::endl; - // First value is always genotypes if there are any - if(entry.hasGenotypes == true && i == 0) - continue; + this->writer->stream->flush(); + return(this->writer->stream->good()); +} - // Hash INFO values - stream_container& target_container = this->block.format_containers[mapID]; +bool VariantImporter::WriteKeychain(const encryption::Keychain<>& keychain){ + // Write keychain + if(this->settings_.encrypt_data){ + if(this->settings_.output_prefix.size()){ + std::ofstream writer_keychain; + writer_file_type* wstats = reinterpret_cast(this->writer); + writer_keychain.open(wstats->basePath + wstats->baseName + ".kyon", std::ios::out); + if(!SILENT) + std::cerr << utility::timestamp("LOG") << "Writing encryption keychain to: " << (wstats->basePath + wstats->baseName) << ".kyon" << std::endl; - // Flags and integers - // These are BCF value types - if(entry.formatID[i].primitive_type <= 3){ - for(U32 s = 0; s < this->header->samples; ++s){ - for(U32 j = 0; j < entry.formatID[i].l_stride; ++j) - target_container.Add(entry.getInteger(entry.formatID[i].primitive_type, internal_pos)); - } - } - // Floats - else if(entry.formatID[i].primitive_type == bcf::BCF_FLOAT){ - for(U32 s = 0; s < this->header->samples; ++s){ - for(U32 j = 0; j < entry.formatID[i].l_stride; ++j) - target_container.Add(entry.getFloat(internal_pos)); - } - } - // Chars - else if(entry.formatID[i].primitive_type == bcf::BCF_CHAR){ - for(U32 s = 0; s < this->header->samples; ++s){ - //for(U32 j = 0; j < entry.formatID[i].l_stride; ++j) - target_container.AddCharacter(entry.getCharPointer(internal_pos), entry.formatID[i].l_stride); - internal_pos += entry.formatID[i].l_stride; + if(writer_keychain.good()){ + //writer_keychain.write(keybuffer.data(), keybuffer.size()); + writer_keychain << keychain; + writer_keychain.flush(); } + const U32 keychain_size = writer_keychain.tellp(); + writer_keychain.close(); + if(!SILENT) + std::cerr << utility::timestamp("LOG") << "Wrote keychain with " << 
utility::ToPrettyString(keychain.size()) << " keys to " << utility::toPrettyDiskString(keychain_size) << "..." << std::endl; } - // Illegal: parsing error - else { - std::cerr << "impossible: " << (int)entry.formatID[i].primitive_type << std::endl; - std::cerr << utility::timestamp("LOG") << entry.formatID[i].mapID << '\t' << entry.formatID[i].l_stride << '\t' << (int)entry.formatID[i].primitive_type << '\t' << internal_pos << '/' << entry.l_data << std::endl; - exit(1); - } - - ++target_container; - target_container.addStride(entry.formatID[i].l_stride); - } - - if(entry.filterPointer){ - // Hash FILTER pattern - const U64 hash_filter_vector = entry.hashFilter(); - - S32 mapID = this->block.getPatternsFILTER(hash_filter_vector); - if(mapID == -1){ - std::vector ret_pattern; - for(U32 i = 0; i < entry.filterPointer; ++i) - ret_pattern.push_back(this->header->filter_remap[entry.filterID[i].mapID]); - - mapID = this->block.filter_patterns.size(); - assert(mapID < 65536); - this->block.addPatternFILTER(ret_pattern, hash_filter_vector); - } - meta.filter_pattern_id = mapID; - } - - if(entry.infoPointer){ - // Hash INFO pattern - const U64 hash_info_vector = entry.hashInfo(); - - S32 mapID = this->block.getPatternsINFO(hash_info_vector); - if(mapID == -1){ - std::vector ret_pattern; - for(U32 i = 0; i < entry.infoPointer; ++i) - ret_pattern.push_back(this->header->info_remap[entry.infoID[i].mapID]); - - mapID = this->block.info_patterns.size(); - assert(mapID < 65536); - this->block.addPatternINFO(ret_pattern, hash_info_vector); - } - meta.info_pattern_id = mapID; } + return true; +} - if(entry.formatPointer){ - // Hash FORMAT pattern - const U64 hash_format_vector = entry.hashFormat(); - - S32 mapID = this->block.getPatternsFORMAT(hash_format_vector); - if(mapID == -1){ - std::vector ret_pattern; - for(U32 i = 0; i < entry.formatPointer; ++i) - ret_pattern.push_back(this->header->format_remap[entry.formatID[i].mapID]); +bool VariantImporter::WriteYonHeader(){ + 
VariantHeader yon_header(this->vcf_reader_->vcf_header_); + + io::BasicBuffer temp(500000); + io::BasicBuffer temp_cmp(temp); + temp << yon_header; + this->compression_manager.zstd_codec.Compress(temp, temp_cmp, 20); + uint32_t l_data = temp.size(); + uint32_t l_c_data = temp_cmp.size(); + utility::SerializePrimitive(l_data, *this->writer->stream); + utility::SerializePrimitive(l_c_data, *this->writer->stream); + this->writer->stream->write(temp_cmp.data(), l_c_data); + return(this->writer->stream->good()); +} - mapID = this->block.format_patterns.size(); - assert(mapID < 65536); - this->block.addPatternFORMAT(ret_pattern, hash_format_vector); +bool VariantImporter::GenerateIdentifiers(void){ + BYTE RANDOM_BYTES[32]; + for(U32 i = 0; i < this->vcf_container_.sizeWithoutCarryOver(); ++i){ + U64 b_hash; + while(true){ + RAND_bytes(&RANDOM_BYTES[0], 32); + b_hash = XXH64(&RANDOM_BYTES[0], 32, 1337); + hash_map_type::const_iterator it = this->block_hash_map.find(b_hash); + if(it == this->block_hash_map.end()){ + this->block_hash_map[b_hash] = 0; // Number doesn't matter. 
+ break; + } } - meta.format_pattern_id = mapID; } - - // Return return true; } diff --git a/lib/variant_importer.h b/lib/variant_importer.h index 928e674..bb2293a 100644 --- a/lib/variant_importer.h +++ b/lib/variant_importer.h @@ -1,18 +1,23 @@ #ifndef VARIANT_IMPORTER_H_ #define VARIANT_IMPORTER_H_ +#include + #include "algorithm/compression/compression_manager.h" #include "algorithm/compression/genotype_encoder.h" -#include "algorithm/permutation/radix_sort_gt.h" #include "algorithm/timer.h" #include "containers/variant_block.h" #include "core/variant_import_writer.h" #include "core/variant_importer_container_stats.h" #include "index/index_entry.h" #include "index/index_index_entry.h" -#include "io/bcf/BCFReader.h" +#include "io/vcf_utils.h" #include "support/helpers.h" #include "support/type_definitions.h" +#include "algorithm/digest/variant_digest_manager.h" +#include "core/footer/footer.h" +#include "algorithm/encryption/encryption_decorator.h" +#include "algorithm/permutation/genotype_sorter.h" namespace tachyon { @@ -35,7 +40,7 @@ struct VariantImporterSettings{ ~VariantImporterSettings() = default; - std::string getInterpretedString(void) const{ + std::string GetInterpretedString(void) const{ return(std::string("##tachyon_importInterpretedCommand=input_file=" + this->input_file + ";output_prefix=" + this->output_prefix + ";checkpoint_snps=" + std::to_string(this->checkpoint_n_snps) + @@ -44,12 +49,12 @@ struct VariantImporterSettings{ )); } - inline void setInputFile(const std::string& input_name){ this->input_file = input_name; } - inline void setOutputPrefix(const std::string& output_prefix){ this->output_prefix = output_prefix; } - inline void setThreads(const U32 n_threads){ this->n_threads = n_threads; } - inline void setPermute(const bool yes){ this->permute_genotypes = yes; } - inline void setEncrypt(const bool yes){ this->encrypt_data = yes; } - inline void setCompressionLevel(const U32 compression_level){ this->compression_level = 
compression_level; } + inline void SetInputFile(const std::string& input_name){ this->input_file = input_name; } + inline void SetOutputPrefix(const std::string& output_prefix){ this->output_prefix = output_prefix; } + inline void SetThreads(const U32 n_threads){ this->n_threads = n_threads; } + inline void SetPermute(const bool yes){ this->permute_genotypes = yes; } + inline void SetEncrypt(const bool yes){ this->encrypt_data = yes; } + inline void SetCompressionLevel(const U32 compression_level){ this->compression_level = compression_level; } public: bool permute_genotypes; // permute GT flag @@ -72,38 +77,53 @@ class VariantImporter { typedef VariantImportWriterFile writer_file_type; typedef VariantImportWriterStream writer_stream_type; typedef io::BasicBuffer buffer_type; - typedef vcf::VCFHeader header_type; + typedef index::IndexEntry index_entry_type; - typedef bcf::BCFReader bcf_reader_type; - typedef bcf::BCFEntry bcf_entry_type; + + typedef io::VcfReader vcf_reader_type; + typedef containers::VcfContainer vcf_container_type; + typedef algorithm::CompressionManager compression_manager_type; - typedef algorithm::RadixSortGT radix_sorter_type; - typedef algorithm::PermutationManager permutation_type; + typedef algorithm::GenotypeSorter radix_sorter_type; typedef algorithm::GenotypeEncoder gt_encoder_type; typedef containers::DataContainer stream_container; - typedef containers::HashContainer hash_container_type; - typedef containers::HashVectorContainer hash_vector_container_type; typedef containers::VariantBlock block_type; typedef support::VariantImporterContainerStats import_stats_type; typedef core::MetaEntry meta_type; typedef VariantImporterSettings settings_type; + typedef std::unordered_map reorder_map_type; + typedef std::unordered_map hash_map_type; public: VariantImporter(); VariantImporter(const settings_type& settings); - - VariantImporter(std::string inputFile, std::string outputPrefix, const U32 checkpoint_size, const double checkpoint_bases); 
~VariantImporter(); + bool Build(); - void setWriterTypeFile(void){ this->writer = new writer_file_type; } - void setWriterTypeStream(void){ this->writer = new writer_stream_type; } + void SetWriterTypeFile(void){ this->writer = new writer_file_type; } + void SetWriterTypeStream(void){ this->writer = new writer_stream_type; } + + void clear(void); private: - bool BuildBCF(); // import a BCF file - bool addSite(meta_type& meta, bcf_entry_type& line); // Import a BCF line - bool addGenotypes(bcf_reader_type& bcf_reader, meta_type* meta_entries); - bool parseBCFBody(meta_type& meta, bcf_entry_type& line); + bool BuildVCF(); + bool AddRecords(const vcf_container_type& container); + bool AddRecord(const vcf_container_type& container, const U32 position, meta_type& meta); + bool AddVcfInfo(const bcf1_t* record, meta_type& meta); + bool AddVcfFormatInfo(const bcf1_t* record, meta_type& meta); + bool AddVcfFilterInfo(const bcf1_t* record, meta_type& meta); + bool IndexRecord(const bcf1_t* record, const meta_type& meta); + bool AddVcfInfoPattern(const std::vector& pattern, meta_type& meta); + bool AddVcfFormatPattern(const std::vector& pattern, meta_type& meta); + bool AddVcfFilterPattern(const std::vector& pattern, meta_type& meta); + bool AddGenotypes(const vcf_container_type& container, meta_type* meta_entries); + bool UpdateIndex(); + bool WriteBlock(); + bool WriteFinal(algorithm::VariantDigestManager& checksums); + bool WriteKeychain(const encryption::Keychain<>& keychain); + bool WriteYonHeader(); + bool GenerateIdentifiers(void); private: settings_type settings_; // internal settings @@ -115,17 +135,31 @@ class VariantImporter { import_stats_type stats_format; // Read/write fields - writer_interface_type* writer; // writer - + writer_interface_type* writer; // writer index_entry_type index_entry; // streaming index entry radix_sorter_type permutator; // GT permuter - header_type* header; // header gt_encoder_type encoder; // RLE packer compression_manager_type 
compression_manager; // Data container block_type block; + + // Map from BCF global FORMAT/INFO/FILTER IDX to local IDX such that + // FORMAT maps to [0, f-1], and INFO maps to [0, i-1] and FILTER to + // [0,l-1] and where f+i+l = n, where n is the total number of fields. + // + // Global Local + // std::unordered_map filter_reorder_map_; + reorder_map_type filter_reorder_map_; + reorder_map_type info_reorder_map_; + reorder_map_type format_reorder_map_; + reorder_map_type contig_reorder_map_; + + std::unique_ptr vcf_reader_; + vcf_container_type vcf_container_; + + hash_map_type block_hash_map; }; diff --git a/lib/variant_reader.cpp b/lib/variant_reader.cpp index 78ca942..1ea1fc3 100644 --- a/lib/variant_reader.cpp +++ b/lib/variant_reader.cpp @@ -2,18 +2,17 @@ namespace tachyon{ -VariantReader::VariantReader() : - filesize(0) +VariantReader::VariantReader() {} VariantReader::VariantReader(const std::string& filename) : - filesize(0) + basic_reader(filename) {} VariantReader::~VariantReader(){} VariantReader::VariantReader(const self_type& other) : - filesize(other.filesize), + basic_reader(other.basic_reader), block_settings(other.block_settings), settings(other.settings), global_header(other.global_header), @@ -22,7 +21,7 @@ VariantReader::VariantReader(const self_type& other) : checksums(other.checksums), keychain(other.keychain) { - this->stream.open(this->settings.input, std::ios::in | std::ios::binary); + this->basic_reader.open(); } bool VariantReader::open(void){ @@ -31,21 +30,23 @@ bool VariantReader::open(void){ return false; } - this->stream.open(this->settings.input, std::ios::binary | std::ios::in | std::ios::ate); - this->filesize = (U64)this->stream.tellg(); + if(this->basic_reader.open() == false){ + std::cerr << "Failed to open" << std::endl; + return false; + } - if(this->filesize <= YON_FOOTER_LENGTH){ + if(this->basic_reader.filesize_ <= YON_FOOTER_LENGTH){ std::cerr << utility::timestamp("ERROR") << "File is corrupted!" 
<< std::endl; return false; } // Seek to start of footer - this->stream.seekg(this->filesize - YON_FOOTER_LENGTH); - if(!this->stream.good()){ + this->basic_reader.stream_.seekg((U64)this->basic_reader.filesize_ - YON_FOOTER_LENGTH); + if(!this->basic_reader.stream_.good()){ std::cerr << utility::timestamp("ERROR") << "Failed to seek in file!" << std::endl; return false; } - this->stream >> this->global_footer; + this->basic_reader.stream_ >> this->global_footer; // Validate footer if(this->global_footer.validate() == false){ @@ -53,14 +54,14 @@ bool VariantReader::open(void){ return false; } - if(!this->stream.good()){ + if(!this->basic_reader.stream_.good()){ std::cerr << utility::timestamp("ERROR") << "Failed to read file!" << std::endl; return false; } // Seek to start of file - this->stream.seekg(0); - if(!this->stream.good()){ + this->basic_reader.stream_.seekg(0); + if(!this->basic_reader.stream_.good()){ std::cerr << utility::timestamp("ERROR") << "Failed to rewind file!" << std::endl; return false; } @@ -68,29 +69,30 @@ bool VariantReader::open(void){ // Load header //this->stream >> this->global_header; char magic_string[tachyon::constants::FILE_HEADER_LENGTH]; - this->stream.read(&magic_string[0], tachyon::constants::FILE_HEADER_LENGTH); + this->basic_reader.stream_.read(&magic_string[0], tachyon::constants::FILE_HEADER_LENGTH); if(strncmp(&magic_string[0], &tachyon::constants::FILE_HEADER[0], tachyon::constants::FILE_HEADER_LENGTH) != 0){ std::cerr << utility::timestamp("ERROR") << "Failed to validate Tachyon magic string!" 
<< std::endl; return false; } - containers::DataContainer header_container; - this->stream >> header_container.header; - header_container.buffer_data.resize(header_container.header.data_header.cLength); - this->stream.read(header_container.buffer_data.data(), header_container.header.data_header.cLength); - header_container.buffer_data.n_chars = header_container.header.data_header.cLength; - if(!this->codec_manager.zstd_codec.decompress(header_container)){ - std::cerr << utility::timestamp("ERROR") << "Failed to decompress header!" << std::endl; - return false; - } - header_container.buffer_data_uncompressed >> this->global_header; // parse header from buffer + uint32_t l_data = 0; + uint32_t l_c_data = 0; + utility::DeserializePrimitive(l_data, this->basic_reader.stream_); + utility::DeserializePrimitive(l_c_data, this->basic_reader.stream_); - if(!this->global_header.header_magic.validate()){ - std::cerr << utility::timestamp("ERROR") << "Failed to validate header!" << std::endl; + io::BasicBuffer header_uncompressed(l_data + 1024); + io::BasicBuffer header_compressed(l_c_data + 1024); header_compressed.n_chars = l_c_data; + + this->basic_reader.stream_.read(header_compressed.data(), l_c_data); + + if(!this->codec_manager.zstd_codec.Decompress(header_compressed, header_uncompressed)){ + std::cerr << utility::timestamp("ERROR") << "Failed to decompress header!" << std::endl; return false; } + assert(header_uncompressed.size() == l_data); + header_uncompressed >> this->global_header; // parse header from buffer - if(!this->stream.good()){ + if(!this->basic_reader.stream_.good()){ std::cerr << utility::timestamp("ERROR") << "Failed to get header!" 
<< std::endl; return false; } @@ -98,61 +100,59 @@ bool VariantReader::open(void){ this->variant_container << this->global_header; // Keep track of start position - const U64 return_pos = this->stream.tellg(); - this->stream.seekg(this->global_footer.offset_end_of_data); - this->stream >> this->index; - this->stream >> this->checksums; - this->stream.seekg(return_pos); - - // Parse settings - this->getBlockSettings().parseSettings(this->global_header); + const U64 return_pos = this->basic_reader.stream_.tellg(); + this->basic_reader.stream_.seekg(this->global_footer.offset_end_of_data); + this->basic_reader.stream_ >> this->index; + this->basic_reader.stream_ >> this->checksums; + this->basic_reader.stream_.seekg(return_pos); - return(this->stream.good()); + return(this->basic_reader.stream_.good()); } -bool VariantReader::nextBlock(){ +bool VariantReader::NextBlock(){ // If the stream is faulty then return - if(!this->stream.good()){ + if(!this->basic_reader.stream_.good()){ std::cerr << utility::timestamp("ERROR", "IO") << "Corrupted! Input stream died prematurely!" << std::endl; return false; } // If the current position is the EOF then // exit the function - if((U64)this->stream.tellg() == this->global_footer.offset_end_of_data) + if((U64)this->basic_reader.stream_.tellg() == this->global_footer.offset_end_of_data) return false; // Reset and re-use this->variant_container.reset(); - if(!this->variant_container.getBlock().readHeaderFooter(this->stream)) + if(!this->variant_container.GetBlock().ReadHeaderFooter(this->basic_reader.stream_)) return false; - if(!this->codec_manager.zstd_codec.decompress(this->variant_container.getBlock().footer_support)){ + + if(!this->codec_manager.zstd_codec.Decompress(this->variant_container.GetBlock().footer_support)){ std::cerr << utility::timestamp("ERROR", "COMPRESSION") << "Failed decompression of footer!" 
<< std::endl; } - this->variant_container.getBlock().footer_support.buffer_data_uncompressed >> this->variant_container.getBlock().footer; + this->variant_container.GetBlock().footer_support.buffer_data_uncompressed >> this->variant_container.GetBlock().footer; // Attempts to read a YON block with the settings provided - if(!this->variant_container.readBlock(this->stream, this->block_settings)) + if(!this->variant_container.ReadBlock(this->basic_reader.stream_, this->block_settings)) return false; // encryption manager ascertainment - if(this->variant_container.anyEncrypted()){ + if(this->variant_container.AnyEncrypted()){ if(this->keychain.size() == 0){ std::cerr << utility::timestamp("ERROR", "DECRYPTION") << "Data is encrypted but no keychain was provided!" << std::endl; return false; } - encryption::EncryptionDecorator e; - if(!e.decryptAES256(this->variant_container.getBlock(), this->keychain)){ + encryption_manager_type encryption_manager; + if(!encryption_manager.decryptAES256(this->variant_container.GetBlock(), this->keychain)){ std::cerr << utility::timestamp("ERROR", "DECRYPTION") << "Failed decryption!" << std::endl; return false; } } // Internally decompress available data - if(!this->codec_manager.decompress(this->variant_container.getBlock())){ + if(!this->codec_manager.Decompress(this->variant_container.GetBlock())){ std::cerr << utility::timestamp("ERROR", "COMPRESSION") << "Failed decompression!" << std::endl; return false; } @@ -161,981 +161,504 @@ bool VariantReader::nextBlock(){ return true; } -bool VariantReader::getBlock(const index_entry_type& index_entry){ - // If the stream is faulty then return - if(!this->stream.good()){ +bool VariantReader::GetBlock(const index_entry_type& index_entry){ + // If the stream is not good then return. + if(!this->basic_reader.stream_.good()){ std::cerr << utility::timestamp("ERROR", "IO") << "Corrupted! Input stream died prematurely!" 
<< std::endl; return false; } - // Reset and re-use - this->variant_container.getBlock().clear(); - - // Seek to target block ID with the help of the index - this->stream.seekg(index_entry.byte_offset); - if(this->stream.good() == false){ + // Seek to target block id with the help of the linear index. + this->basic_reader.stream_.seekg(index_entry.byte_offset); + if(this->basic_reader.stream_.good() == false){ std::cerr << utility::timestamp("ERROR", "IO") << "Failed to seek to given offset using target index entry!" << std::endl; return(false); } - if(!this->variant_container.getBlock().readHeaderFooter(this->stream)) - return false; - - if(!this->codec_manager.zstd_codec.decompress(this->variant_container.getBlock().footer_support)){ - std::cerr << utility::timestamp("ERROR", "COMPRESSION") << "Failed decompression of footer!" << std::endl; - return(false); - } - this->variant_container.getBlock().footer_support.buffer_data_uncompressed >> this->variant_container.getBlock().footer; - - // Attempts to read a YON block with the provided - if(!this->variant_container.readBlock(this->stream, this->block_settings)) - return false; - - // encryption manager ascertainment - if(this->variant_container.getBlock().header.controller.anyEncrypted){ - if(this->keychain.size() == 0){ - std::cerr << utility::timestamp("ERROR", "DECRYPTION") << "Data is encrypted but no keychain was provided!" << std::endl; - return false; - } - - encryption::EncryptionDecorator e; - if(!e.decryptAES256(this->variant_container.getBlock(), this->keychain)){ - std::cerr << utility::timestamp("ERROR", "DECRYPTION") << "Failed decryption!" << std::endl; - return false; - } - } - - // Internally decompress available data - if(!this->codec_manager.decompress(this->variant_container.getBlock())){ - std::cerr << utility::timestamp("ERROR", "COMPRESSION") << "Failed decompression!" << std::endl; - return false; - } - - // All passed - return true; + // Load the next block if possible. 
+ return(this->NextBlock()); } -containers::VariantBlockContainer VariantReader::getBlock(){ - // If the stream is faulty then return - if(!this->stream.good()){ - std::cerr << utility::timestamp("ERROR", "IO") << "Corrupted! Input stream died prematurely!" << std::endl; - return variant_container_type(); - } - - // If the current position is the EOF then - // exit the function - if((U64)this->stream.tellg() == this->global_footer.offset_end_of_data) - return variant_container_type(); - - // Reset and re-use - containers::VariantBlockContainer block; - - if(!block.getBlock().readHeaderFooter(this->stream)) - return variant_container_type(); - - if(!this->codec_manager.zstd_codec.decompress(block.getBlock().footer_support)){ - std::cerr << utility::timestamp("ERROR", "COMPRESSION") << "Failed decompression of footer!" << std::endl; - } - block.getBlock().footer_support.buffer_data_uncompressed >> block.getBlock().footer; +TACHYON_VARIANT_CLASSIFICATION_TYPE VariantReader::ClassifyVariant(const meta_entry_type& meta, const U32& allele) const{ + const S32 ref_size = meta.alleles[0].size(); + const S32 diff = ref_size - meta.alleles[allele].size(); + //std::cerr << diff << ","; + if(meta.alleles[0].allele[0] == '<' || meta.alleles[allele].allele[0] == '<') return(YON_VARIANT_CLASS_SV); + else if(diff == 0){ + if(ref_size == 1 && meta.alleles[0].allele[0] != meta.alleles[allele].allele[0]){ + if(meta.alleles[allele].allele[0] == 'A' || meta.alleles[allele].allele[0] == 'T' || meta.alleles[allele].allele[0] == 'G' || meta.alleles[allele].allele[0] == 'C') + return(YON_VARIANT_CLASS_SNP); + else return(YON_VARIANT_CLASS_UNKNOWN); + } + else if(ref_size != 1){ + U32 characters_identical = 0; + const U32 length_shortest = ref_size < meta.alleles[allele].size() ? 
ref_size : meta.alleles[allele].size(); - // Attempts to read a YON block with the settings provided - if(!block.readBlock(this->stream, this->getBlockSettings())) - return variant_container_type(); + for(U32 c = 0; c < length_shortest; ++c){ + characters_identical += (meta.alleles[0].allele[c] == meta.alleles[allele].allele[c]); + } - // encryption manager ascertainment - if(block.anyEncrypted()){ - if(this->keychain.size() == 0){ - std::cerr << utility::timestamp("ERROR", "DECRYPTION") << "Data is encrypted but no keychain was provided!" << std::endl; - return variant_container_type(); + if(characters_identical == 0) return(YON_VARIANT_CLASS_MNP); + else return(YON_VARIANT_CLASS_CLUMPED); } - - encryption::EncryptionDecorator e; - if(!e.decryptAES256(block.getBlock(), this->keychain)){ - std::cerr << utility::timestamp("ERROR", "DECRYPTION") << "Failed decryption!" << std::endl; - return variant_container_type(); + } else { + const U32 length_shortest = ref_size < meta.alleles[allele].size() ? ref_size : meta.alleles[allele].size(); + U32 characters_non_standard = 0; + for(U32 c = 0; c < length_shortest; ++c){ + characters_non_standard += (meta.alleles[allele].allele[c] != 'A' && meta.alleles[allele].allele[c] != 'T' && meta.alleles[allele].allele[c] != 'C' && meta.alleles[allele].allele[c] !='G'); } + if(characters_non_standard) return(YON_VARIANT_CLASS_UNKNOWN); + else return(YON_VARIANT_CLASS_INDEL); } - - // Internally decompress available data - if(!this->codec_manager.decompress(block.getBlock())){ - std::cerr << utility::timestamp("ERROR", "COMPRESSION") << "Failed decompression!" 
<< std::endl; - return variant_container_type(); - } - - // All passed - return block; + return(YON_VARIANT_CLASS_UNKNOWN); } -const int VariantReader::has_format_field(const std::string& field_name) const{ - core::HeaderMapEntry* match = nullptr; - if(this->global_header.getFormatField(field_name, match)){ - return(match->IDX); - } - return(-2); -} +void VariantReader::OuputVcfWrapper(io::BasicBuffer& output_buffer, yon1_t& entry) const{ + utility::to_vcf_string(output_buffer, '\t', *entry.meta, this->global_header); + output_buffer += '\t'; -const int VariantReader::has_info_field(const std::string& field_name) const{ - core::HeaderMapEntry* match = nullptr; - if(this->global_header.getInfoField(field_name, match)){ - return(match->IDX); - } - return(-2); -} + // Print Filter, Info, and Format if available. + this->OutputFilterVcf(output_buffer, entry); + this->OutputInfoVcf(output_buffer, entry); + this->OutputFormatVcf(output_buffer, entry); + output_buffer += '\n'; -const int VariantReader::has_filter_field(const std::string& field_name) const{ - core::HeaderMapEntry* match = nullptr; - if(this->global_header.getFilterField(field_name, match)){ - return(match->IDX); + if(output_buffer.size() > 65536){ + std::cout.write(output_buffer.data(), output_buffer.size()); + output_buffer.reset(); } - return(-2); } -VariantReaderObjects& VariantReader::loadObjects(objects_type& objects) const{ - // New meta container - objects.meta_container = new meta_container_type(this->variant_container.getBlock()); - - // New genotype containers if aplicable - if(this->variant_container.getBlock().header.controller.hasGT && block_settings.genotypes_all.load){ - objects.genotype_container = new gt_container_type(this->variant_container.getBlock(), *objects.meta_container); - objects.genotype_summary = new objects_type::genotype_summary_type(10); - } - - // FORMAT-specific containers - // Store as double pointers to avoid memory collisions because - // FORMAT containers have 
different intrinsic class members - objects.n_loaded_format = this->variant_container.getMapper().getNumberFormatLoaded(); - objects.format_containers = new format_interface_type*[objects.n_loaded_format]; - - if(objects.n_loaded_format){ - for(U32 i = 0; i < objects.n_loaded_format; ++i){ - const U32 global_key = this->variant_container.getMapper().getLoadedFormat(i).stream_id_global; - - // Pattern matches of GLOBAL in LOCAL - // This evaluated the boolean set-membership of GLOBAL key in the FORMAT patterns - std::vector matches = this->variant_container.get_format_field_pattern_matches(this->global_header.format_fields[global_key].ID); - - if(this->global_header.format_fields[global_key].getType() == YON_VCF_HEADER_INTEGER){ - objects.format_containers[i] = new containers::FormatContainer(this->variant_container.getBlock().format_containers[i], *objects.meta_container, matches, this->global_header.getSampleNumber()); - objects.format_field_names.push_back(this->global_header.format_fields[global_key].ID); - } else if(this->global_header.format_fields[global_key].getType() == YON_VCF_HEADER_STRING || - this->global_header.format_fields[global_key].getType() == YON_VCF_HEADER_CHARACTER){ - objects.format_containers[i] = new containers::FormatContainer(this->variant_container.getBlock().format_containers[i], *objects.meta_container, matches, this->global_header.getSampleNumber()); - objects.format_field_names.push_back(this->global_header.format_fields[global_key].ID); - } else if(this->global_header.format_fields[global_key].getType() == YON_VCF_HEADER_FLOAT){ - objects.format_containers[i] = new containers::FormatContainer(this->variant_container.getBlock().format_containers[i], *objects.meta_container, matches, this->global_header.getSampleNumber()); - objects.format_field_names.push_back(this->global_header.format_fields[global_key].ID); +void VariantReader::OutputInfoVcf(io::BasicBuffer& output_buffer, yon1_t& entry) const{ + // Print Info. 
+ if(entry.n_info){ + const uint32_t n_info_avail = entry.info_ids->size(); + if(n_info_avail){ + if(entry.info_hdr[0]->yon_type == YON_VCF_HEADER_FLAG){ + output_buffer += entry.info_hdr[0]->id; } else { - objects.format_containers[i] = new containers::FormatContainer; - objects.format_field_names.push_back(this->global_header.format_fields[global_key].ID); + output_buffer += entry.info_hdr[0]->id; + output_buffer += '='; + entry.info[0]->to_vcf_string(output_buffer); } - } - } - // INFO-specific containers - // Store as double pointers to avoid memory collisions because - // INFO containers have different class members - objects.n_loaded_info = this->variant_container.getMapper().getNumberInfoLoaded(); - objects.info_containers = new info_interface_type*[objects.n_loaded_info]; - - if(objects.n_loaded_info){ - for(U32 i = 0; i < objects.n_loaded_info; ++i){ - const U32 global_key = this->variant_container.getMapper().getLoadedInfo(i).stream_id_global; - - // Pattern matches of GLOBAL in LOCAL - // This evaluated the boolean set-membership of GLOBAL key in the FORMAT patterns - std::vector matches = this->variant_container.get_info_field_pattern_matches(this->global_header.info_fields[global_key].ID); - - if(this->global_header.info_fields[global_key].getType() == YON_VCF_HEADER_INTEGER){ - objects.info_containers[i] = new containers::InfoContainer(this->variant_container.getBlock().info_containers[i], *objects.meta_container, matches); - objects.info_field_names.push_back(this->global_header.info_fields[global_key].ID); - } else if(this->global_header.info_fields[global_key].getType() == YON_VCF_HEADER_STRING || - this->global_header.info_fields[global_key].getType() == YON_VCF_HEADER_CHARACTER){ - objects.info_containers[i] = new containers::InfoContainer(this->variant_container.getBlock().info_containers[i], *objects.meta_container, matches); - objects.info_field_names.push_back(this->global_header.info_fields[global_key].ID); - } else 
if(this->global_header.info_fields[global_key].getType() == YON_VCF_HEADER_FLOAT){ - objects.info_containers[i] = new containers::InfoContainer(this->variant_container.getBlock().info_containers[i], *objects.meta_container, matches); - objects.info_field_names.push_back(this->global_header.info_fields[global_key].ID); - } else { - objects.info_containers[i] = new containers::InfoContainer(); - objects.info_field_names.push_back(this->global_header.info_fields[global_key].ID); + for(U32 j = 1; j < n_info_avail; ++j){ + output_buffer += ';'; + if(entry.info_hdr[j]->yon_type == YON_VCF_HEADER_FLAG){ + output_buffer += entry.info_hdr[j]->id; + } else { + output_buffer += entry.info_hdr[j]->id; + output_buffer += '='; + entry.info[j]->to_vcf_string(output_buffer); + } + } + + if(this->GetBlockSettings().annotate_extra){ + entry.EvaluateSummary(true); + entry.gt_sum->d->PrintVcf(output_buffer); } } + } else { + if(this->GetBlockSettings().annotate_extra){ + entry.EvaluateSummary(true); + entry.gt_sum->d->PrintVcf(output_buffer); + } else + output_buffer += '.'; } +} +void VariantReader::OutputFormatVcf(io::BasicBuffer& output_buffer, const yon1_t& entry) const{ + if(entry.n_format){ + output_buffer += '\t'; - // If we want to drop records that do not have all/any of the fields we desire - // then we create a vector of size N_PATTERNS and set those that MATCH to TRUE - // this allows for filtering in O(1)-time - // - // This vector stores the number of INFO fields having set membership with this - // particular hash pattern - objects.info_id_fields_keep = std::vector(this->variant_container.getBlock().footer.n_info_patterns, 0); - objects.format_id_fields_keep = std::vector(this->variant_container.getBlock().footer.n_format_patterns, 0); - - // This vector of vectors keeps the local INFO identifiers for the matched global - // identifiers in a given hash pattern - // - // For example: x[5] contains the local IDs for loaded INFO streams for pattern ID 5 - 
objects.local_match_keychain_info = std::vector< std::vector >(this->variant_container.getBlock().footer.n_info_patterns); - objects.local_match_keychain_format = std::vector< std::vector >(this->variant_container.getBlock().footer.n_format_patterns); - - // If loading all INFO values then return them in the ORIGINAL order - if(this->block_settings.info_all.load){ - for(U32 i = 0; i < this->variant_container.getBlock().footer.n_info_patterns; ++i){ // Number of info patterns - for(U32 j = 0; j < this->variant_container.getBlock().footer.info_bit_vectors[i].n_keys; ++j){ // Number of keys in pattern [i] - for(U32 k = 0; k < objects.n_loaded_info; ++k){ // Number of loaded INFO identifiers - // Global - if(this->variant_container.getBlock().footer.info_offsets[this->variant_container.getBlock().footer.info_bit_vectors[i].local_keys[j]].data_header.global_key == this->variant_container.getMapper().getLoadedInfo(k).offset->data_header.global_key){ - objects.local_match_keychain_info[i].push_back(k); - ++objects.info_id_fields_keep[i]; - } - } + const uint32_t n_format_avail = entry.format_ids->size(); + if(n_format_avail){ + output_buffer += entry.format_hdr[0]->id; + for(U32 j = 1; j < n_format_avail; ++j){ + output_buffer += ':'; + output_buffer += entry.format_hdr[j]->id; } - } - } - // If loading custom INFO fields then return them in the REQUESTED order - else { - for(U32 i = 0; i < this->variant_container.getBlock().footer.n_info_patterns; ++i){ // i = Number of info patterns - for(U32 k = 0; k < objects.n_loaded_info; ++k){ // k = Number of loaded INFO identifiers - for(U32 j = 0; j < this->variant_container.getBlock().footer.info_bit_vectors[i].n_keys; ++j){ // j = Number of keys in pattern [i] - if(this->variant_container.getBlock().footer.info_offsets[this->variant_container.getBlock().footer.info_bit_vectors[i].local_keys[j]].data_header.global_key == this->variant_container.getMapper().getLoadedInfo(k).offset->data_header.global_key){ - 
objects.local_match_keychain_info[i].push_back(k); - ++objects.info_id_fields_keep[i]; - } + output_buffer += '\t'; + + // Vcf FORMAT values are interleaved such that values are + // presented in a columnar representation with data for + // each sample is concatenated together such as the pattern + // GT:AF:AS is display for three samples as: + // + // Sample 1 Sample 2 Sample 3 + // 0|0:0.5|ABC 0|0:0.5|ABC 0|0:0.5|ABC + // + // This memory layout requires the interleaving of the + // internally separated data streams resulting in slower + // Vcf printing speeds compared to the naive Bcf file format. + // + // First calculate the FORMAT:GT field for this variant site. + // Case when the only available FORMAT field is the GT field. + if(n_format_avail == 1 && entry.is_loaded_gt && + entry.meta->controller.gt_available && + (this->GetBlockSettings().display_static & YON_BLK_BV_GT)) + { + entry.gt->ExpandExternal(this->variant_container.GetAllocatedGenotypeMemory()); + entry.gt->d_exp = this->variant_container.GetAllocatedGenotypeMemory(); + + // Iterate over samples and print FORMAT:GT value in Vcf format. 
+ entry.gt->d_exp[0]->PrintVcf(output_buffer, entry.gt->m); + for(U32 s = 1; s < this->global_header.GetNumberSamples(); ++s){ + output_buffer += '\t'; + entry.gt->d_exp[s]->PrintVcf(output_buffer, entry.gt->m); } - } - } - } - // For FORMAT - // If loading all FORMAT values then return them in the ORIGINAL order - if(this->block_settings.format_all.load){ - for(U32 i = 0; i < this->variant_container.getBlock().footer.n_format_patterns; ++i){ // Number of info patterns - for(U32 j = 0; j < this->variant_container.getBlock().footer.format_bit_vectors[i].n_keys; ++j){ // Number of keys in pattern [i] - for(U32 k = 0; k < objects.n_loaded_format; ++k){ // Number of loaded INFO identifiers - if(this->variant_container.getBlock().footer.format_offsets[this->variant_container.getBlock().footer.format_bit_vectors[i].local_keys[j]].data_header.global_key == this->variant_container.getMapper().getLoadedFormat(k).offset->data_header.global_key){ - objects.local_match_keychain_format[i].push_back(k); - ++objects.format_id_fields_keep[i]; - } - } + entry.gt->d_exp = nullptr; } - } - } - // If loading custom FORMAT fields then return them in the REQUESTED order - else { - for(U32 i = 0; i < this->variant_container.getBlock().footer.n_format_patterns; ++i){ // i = Number of info patterns - for(U32 k = 0; k < objects.n_loaded_format; ++k){ // k = Number of loaded INFO identifiers - for(U32 j = 0; j < this->variant_container.getBlock().footer.format_bit_vectors[i].n_keys; ++j){ // j = Number of keys in pattern [i] - if(this->variant_container.getBlock().footer.format_offsets[this->variant_container.getBlock().footer.format_bit_vectors[i].local_keys[j]].data_header.global_key == this->variant_container.getMapper().getLoadedFormat(k).offset->data_header.global_key){ - objects.local_match_keychain_format[i].push_back(k); - ++objects.format_id_fields_keep[i]; + // Case when there are > 1 Vcf Format fields and the GT field + // is available. 
+ else if(n_format_avail > 1 && entry.is_loaded_gt && + entry.meta->controller.gt_available && + (this->GetBlockSettings().display_static & YON_BLK_BV_GT)) + { + entry.gt->ExpandExternal(this->variant_container.GetAllocatedGenotypeMemory()); + entry.gt->d_exp = this->variant_container.GetAllocatedGenotypeMemory(); + + entry.gt->d_exp[0]->PrintVcf(output_buffer, entry.gt->m); + for(U32 g = 1; g < n_format_avail; ++g){ + output_buffer += ':'; + entry.fmt[g]->to_vcf_string(output_buffer, 0); + } + for(U32 s = 1; s < this->global_header.GetNumberSamples(); ++s){ + output_buffer += '\t'; + entry.gt->d_exp[s]->PrintVcf(output_buffer, entry.gt->m); + for(U32 g = 1; g < n_format_avail; ++g){ + output_buffer += ':'; + entry.fmt[g]->to_vcf_string(output_buffer, s); } } - } - } - } - - // If we want to compute additional genotypic summary statistics (triggered by -X flag) - // then we need to make sure we don't accidentally add annotations to fields that already - // exists (e.g. if INFO=AC already exists) - // - // Preprocessing step: - // Cycle over INFO patterns and see if any of the custom FIELDS are set - // FS_A, AN, NM, NPM, AC, AC_FW, AC_REV, AF, HWE_P, VT, MULTI_ALLELIC - std::vector ADDITIONAL_INFO = {"FS_A", "AN", "NM", "NPM", "AC", "AC_FW", "AC_REV", "AF", "HWE_P", "VT", "MULTI_ALLELIC", "F_PIC"}; - U16 execute_mask = 0; - - // Step 1: Find INFO - std::vector< std::pair > additional_local_keys_found; - for(U32 i = 0; i < ADDITIONAL_INFO.size(); ++i){ - if(this->global_header.has_info_field(ADDITIONAL_INFO[i])){ - const core::HeaderMapEntry* map = this->global_header.getInfoField(ADDITIONAL_INFO[i]); - // Find local key - for(U32 k = 0; k < objects.n_loaded_info; ++k){ - if(this->variant_container.getBlock().info_containers[k].header.getGlobalKey() == map->IDX){ - execute_mask |= 1 << i; - additional_local_keys_found.push_back(std::pair(k,i)); - } + entry.gt->d_exp = nullptr; } - } - } + // All other cases. 
+ else { + entry.fmt[0]->to_vcf_string(output_buffer, 0); + for(U32 g = 1; g < n_format_avail; ++g){ + output_buffer += ':'; + entry.fmt[g]->to_vcf_string(output_buffer, 0); + } - // Step 2: Cycle over patterns to find existing INFO fields - // Cycle over INFO patterns - objects.additional_info_execute_flag_set = std::vector< U16 >(1, 65535); - if(ADDITIONAL_INFO.size()){ - objects.additional_info_execute_flag_set.reserve(this->variant_container.getBlock().footer.n_info_patterns); - for(U32 i = 0; i < this->variant_container.getBlock().footer.n_info_patterns; ++i){ - objects.additional_info_execute_flag_set[i] = (1 << ADDITIONAL_INFO.size()) - 1; - for(U32 j = 0; j < additional_local_keys_found.size(); ++j){ - if(this->variant_container.getBlock().footer.info_bit_vectors[i][j]){ - objects.additional_info_execute_flag_set[i] &= ~(1 << additional_local_keys_found[j].second); + for(U32 s = 1; s < this->global_header.GetNumberSamples(); ++s){ + output_buffer += '\t'; + entry.fmt[0]->to_vcf_string(output_buffer, s); + for(U32 g = 1; g < n_format_avail; ++g){ + output_buffer += ':'; + entry.fmt[g]->to_vcf_string(output_buffer, s); + } } } } } - - // - - return(objects); } -void VariantReader::printFILTER(buffer_type& outputBuffer, - const U32& position, - const objects_type& objects) const -{ - if(outputBuffer.back() != '\t') outputBuffer += '\t'; - - if(this->block_settings.display_filter && this->variant_container.getBlock().footer.n_filter_streams){ - const U32& n_filter_keys = this->variant_container.getBlock().footer.filter_bit_vectors[(*objects.meta_container)[position].filter_pattern_id].n_keys; - const U32* filter_keys = this->variant_container.getBlock().footer.filter_bit_vectors[(*objects.meta_container)[position].filter_pattern_id].local_keys; - if(n_filter_keys){ - // Local key -> global key - outputBuffer += this->global_header.filter_fields[this->variant_container.getBlock().footer.filter_offsets[filter_keys[0]].data_header.global_key].ID; - for(U32 i = 1; 
i < n_filter_keys; ++i){ - outputBuffer += ';'; - outputBuffer += this->global_header.filter_fields[this->variant_container.getBlock().footer.filter_offsets[filter_keys[i]].data_header.global_key].ID; +void VariantReader::OutputFilterVcf(io::BasicBuffer& output_buffer, const yon1_t& entry) const{ + if(entry.n_filter){ + const uint32_t n_filter_avail = entry.filter_ids->size(); + if(n_filter_avail){ + output_buffer += entry.filter_hdr[0]->id; + for(U32 j = 1; j < n_filter_avail; ++j){ + output_buffer += ';'; + output_buffer += entry.filter_hdr[j]->id; } - } else - outputBuffer += '.'; - } else { - outputBuffer += '.'; - } + } else { + output_buffer += '.'; + } + } else output_buffer += '.'; + output_buffer += '\t'; } -void VariantReader::printFILTERCustom(buffer_type& outputBuffer, - const U32& position, - const objects_type& objects) const -{ - if(this->block_settings.display_filter && this->variant_container.getBlock().footer.n_filter_streams){ - const U32& n_filter_keys = this->variant_container.getBlock().footer.filter_bit_vectors[(*objects.meta_container)[position].filter_pattern_id].n_keys; - const U32* filter_keys = this->variant_container.getBlock().footer.filter_bit_vectors[(*objects.meta_container)[position].filter_pattern_id].local_keys; - if(n_filter_keys){ - if(outputBuffer.back() != this->block_settings.custom_delimiter_char) outputBuffer += this->block_settings.custom_delimiter_char; - - // Local key -> global key - outputBuffer += this->global_header.filter_fields[this->variant_container.getBlock().footer.filter_offsets[filter_keys[0]].data_header.global_key].ID; - for(U32 i = 1; i < n_filter_keys; ++i){ - outputBuffer += ';'; - outputBuffer += this->global_header.filter_fields[this->variant_container.getBlock().footer.filter_offsets[filter_keys[i]].data_header.global_key].ID; - } - } else - outputBuffer += '.'; - } else { - outputBuffer += '.'; +U64 VariantReader::OutputVcfLinear(void){ + this->variant_container.AllocateGenotypeMemory(); + // temp + 
//if(this->occ_table.ReadTable("/media/mdrk/NVMe/1kgp3/populations/integrated_call_samples_v3.20130502.ALL.panel", this->GetGlobalHeader(), '\t') == false){ + // return(0); + //} + + while(this->NextBlock()){ + objects_type* objects = this->GetCurrentContainer().LoadObjects(this->block_settings); + yon1_t* entries = this->GetCurrentContainer().LazyEvaluate(*objects); + io::BasicBuffer output_buffer(100000); + // If occ table is built. + //objects->occ = &occ; + //objects->EvaluateOcc(this->GetCurrentContainer().GetBlock().gt_ppa); + + for(U32 i = 0; i < objects->meta_container->size(); ++i){ + if(this->variant_filters.filter(entries[i], i) == false) + continue; + + // Each entry evaluate occ if available. + //entries[i].occ = objects->occ; + //entries[i].EvaluateOcc(); + + this->OuputVcfWrapper(output_buffer, entries[i]); + } + + std::cout.write(output_buffer.data(), output_buffer.size()); + output_buffer.reset(); + delete [] entries; + //objects->occ = nullptr; + delete objects; } + + return 0; } +U64 VariantReader::OutputVcfSearch(void){ + this->variant_container.AllocateGenotypeMemory(); -void VariantReader::printFILTERJSON(buffer_type& outputBuffer, - const U32& position, - const objects_type& objects) const -{ - if(this->variant_container.getBlock().footer.n_filter_streams){ - const U32& n_filter_keys = this->variant_container.getBlock().footer.filter_bit_vectors[(*objects.meta_container)[position].filter_pattern_id].n_keys; - const U32* filter_keys = this->variant_container.getBlock().footer.filter_bit_vectors[(*objects.meta_container)[position].filter_pattern_id].local_keys; - if(n_filter_keys){ - if(outputBuffer.back() != ',') outputBuffer += ','; - // Local key -> global key - outputBuffer += "\"FILTER-"; - outputBuffer += this->global_header.filter_fields[this->variant_container.getBlock().footer.filter_offsets[filter_keys[0]].data_header.global_key].ID; - outputBuffer += "\":true"; - for(U32 i = 1; i < n_filter_keys; ++i){ - outputBuffer += ','; - 
outputBuffer += "\"FILTER-"; - outputBuffer += this->global_header.filter_fields[this->variant_container.getBlock().footer.filter_offsets[filter_keys[i]].data_header.global_key].ID; - outputBuffer += "\":true"; - } - } - } -} + // Filter functionality + filter_intervals_function filter_intervals = &self_type::FilterIntervals; -void VariantReader::printFORMATVCF(buffer_type& buffer, - const char& delimiter, - const U32& position, - const objects_type& objects, - std::vector& genotypes_unpermuted) const -{ - if(this->block_settings.format_all.display && objects.n_loaded_format){ - if(objects.n_loaded_format){ - const U32& n_format_keys = this->variant_container.getBlock().footer.format_bit_vectors[objects.meta_container->at(position).format_pattern_id].n_keys; - const U32* format_keys = this->variant_container.getBlock().footer.format_bit_vectors[objects.meta_container->at(position).format_pattern_id].local_keys; - if(n_format_keys){ - if(buffer.back() != delimiter) buffer += delimiter; - - // Print key map - buffer += this->global_header.format_fields[this->variant_container.getBlock().footer.format_offsets[format_keys[0]].data_header.global_key].ID; - for(U32 i = 1; i < n_format_keys; ++i){ - buffer += ':'; - buffer += this->global_header.format_fields[this->variant_container.getBlock().footer.format_offsets[format_keys[i]].data_header.global_key].ID; - } - buffer += delimiter; - - // Todo: print if no GT data - // Begin print FORMAT data for each sample - if(this->variant_container.getBlock().header.controller.hasGT){ - if(this->block_settings.ppa.load && this->variant_container.getBlock().header.controller.hasGTPermuted){ - objects.genotype_container->at(position).getObjects(this->global_header.getSampleNumber(), genotypes_unpermuted, this->variant_container.getBlock().ppa_manager); - } else { - objects.genotype_container->at(position).getObjects(this->global_header.getSampleNumber(), genotypes_unpermuted); - } + for(U32 i = 0; i < 
this->interval_container.GetBlockList().size(); ++i){ + this->GetBlock(this->interval_container.GetBlockList()[i]); - buffer << genotypes_unpermuted[0]; - for(U32 i = 1; i < n_format_keys; ++i){ - buffer += ':'; - objects.format_containers[format_keys[i]]->to_vcf_string(buffer, position, 0); - } + objects_type* objects = this->GetCurrentContainer().LoadObjects(this->block_settings); + yon1_t* entries = this->GetCurrentContainer().LazyEvaluate(*objects); + io::BasicBuffer output_buffer(100000); - for(U64 s = 1; s < this->global_header.getSampleNumber(); ++s){ - buffer += delimiter; - buffer << genotypes_unpermuted[s]; - for(U32 i = 1; i < n_format_keys; ++i){ - buffer += ':'; - objects.format_containers[format_keys[i]]->to_vcf_string(buffer, position, s); - } - } - } else { // No genotype data available - objects.format_containers[format_keys[0]]->to_vcf_string(buffer, position, 0); - for(U32 i = 1; i < n_format_keys; ++i){ - buffer += ':'; - objects.format_containers[format_keys[i]]->to_vcf_string(buffer, position, 0); - } + for(U32 i = 0; i < objects->meta_container->size(); ++i){ + if((this->*filter_intervals)(objects->meta_container->at(i)) == false) + continue; - for(U64 s = 1; s < this->global_header.getSampleNumber(); ++s){ - buffer += delimiter; - objects.format_containers[format_keys[0]]->to_vcf_string(buffer, position, s); - for(U32 i = 1; i < n_format_keys; ++i){ - buffer += ':'; - objects.format_containers[format_keys[i]]->to_vcf_string(buffer, position, s); - } - } - } + if(this->variant_filters.filter(entries[i], i) == false) + continue; - } else { // have no keys - if(buffer.back() != delimiter) buffer += delimiter; - buffer += "."; - //buffer += delimiter; - } + this->OuputVcfWrapper(output_buffer, entries[i]); } + + std::cout.write(output_buffer.data(), output_buffer.size()); + output_buffer.reset(); + delete [] entries; + delete objects; } -} -void VariantReader::printFORMATVCF(buffer_type& buffer, const U32& position, const objects_type& objects, 
std::vector& genotypes_unpermuted) const -{ - return(this->printFORMATVCF(buffer, '\t', position, objects, genotypes_unpermuted)); + return 0; } -void VariantReader::printFORMATCustom(buffer_type& outputBuffer, - const char& delimiter, - const U32& position, - const objects_type& objects, - std::vector& genotypes_unpermuted) const -{ - if(block_settings.format_all.display || objects.n_loaded_format){ - const std::vector& targetKeys = objects.local_match_keychain_format[objects.meta_container->at(position).format_pattern_id]; - if(targetKeys.size()){ - if(outputBuffer.back() != delimiter) outputBuffer += delimiter; - - // Print key map - outputBuffer += objects.format_field_names[targetKeys[0]]; - for(U32 i = 1; i < targetKeys.size(); ++i){ - outputBuffer += ':'; - outputBuffer += objects.format_field_names[targetKeys[i]]; - }; - - outputBuffer += delimiter; - - // First individual - //if(this->global_header.getSampleNumber() > 1) outputBuffer += '['; - - //if(targetKeys.size() > 1) outputBuffer += '['; - objects.format_containers[targetKeys[0]]->to_vcf_string(outputBuffer, position, 0); - for(U32 i = 1; i < targetKeys.size(); ++i){ - outputBuffer += ':'; - objects.format_containers[targetKeys[i]]->to_vcf_string(outputBuffer, position, 0); - } - //if(targetKeys.size() > 1) outputBuffer += ']'; - - for(U64 s = 1; s < this->global_header.getSampleNumber(); ++s){ - outputBuffer += delimiter; - //if(targetKeys.size() > 1) outputBuffer += '['; - objects.format_containers[targetKeys[0]]->to_vcf_string(outputBuffer, position, s); - for(U32 i = 1; i < targetKeys.size(); ++i){ - outputBuffer += ':'; - objects.format_containers[targetKeys[i]]->to_vcf_string(outputBuffer, position, s); - } - //if(targetKeys.size() > 1) outputBuffer += ']'; - } +U64 VariantReader::OutputRecords(void){ + this->interval_container.Build(this->global_header); - - //if(this->global_header.getSampleNumber() > 1) outputBuffer += ']'; - } + if(this->settings.use_htslib){ + 
if(this->interval_container.size()) return(this->OutputHtslibVcfSearch()); + else return(this->OutputHtslibVcfLinear()); } -} -void VariantReader::printFORMATCustomVector(buffer_type& outputBuffer, - const char& delimiter, - const U32& position, - const objects_type& objects, - std::vector& genotypes_unpermuted) const -{ - if(block_settings.format_all.display || objects.n_loaded_format){ - const std::vector& targetKeys = objects.local_match_keychain_format[objects.meta_container->at(position).format_pattern_id]; - if(targetKeys.size()){ - if(outputBuffer.back() != delimiter) outputBuffer += delimiter; - - // Print key map - outputBuffer += objects.format_field_names[targetKeys[0]]; - for(U32 i = 1; i < targetKeys.size(); ++i){ - outputBuffer += ':'; - outputBuffer += objects.format_field_names[targetKeys[i]]; - }; - outputBuffer += delimiter; - - // First key - // Cycle over keys - objects.format_containers[targetKeys[0]]->to_vcf_string(outputBuffer, position, 0); - // Cycle over samples - for(U64 s = 1; s < this->global_header.getSampleNumber(); ++s){ - outputBuffer += ','; - objects.format_containers[targetKeys[0]]->to_vcf_string(outputBuffer, position, s); - } + if(this->GetBlockSettings().show_vcf_header) + this->global_header.PrintVcfHeader(std::cout); - for(U32 i = 1; i < targetKeys.size(); ++i){ - outputBuffer += delimiter; - objects.format_containers[targetKeys[i]]->to_vcf_string(outputBuffer, position, 0); - // Cycle over samples - for(U64 s = 1; s < this->global_header.getSampleNumber(); ++s){ - outputBuffer += ','; - objects.format_containers[targetKeys[i]]->to_vcf_string(outputBuffer, position, s); - } - } - } - } + if(this->interval_container.size()) return(this->OutputVcfSearch()); + else return(this->OutputVcfLinear()); } -void VariantReader::printFORMATCustomVectorJSON(buffer_type& outputBuffer, - const char& delimiter, - const U32& position, - const objects_type& objects, - std::vector& genotypes_unpermuted) const -{ - 
if(block_settings.format_all.display || objects.n_loaded_format){ - const std::vector& targetKeys = objects.local_match_keychain_format[objects.meta_container->at(position).format_pattern_id]; - if(targetKeys.size()){ - if(outputBuffer.back() != ',') outputBuffer += ','; - // First key - // Cycle over keys - outputBuffer += "\"FORMAT-"; - outputBuffer += objects.format_field_names[targetKeys[0]]; - outputBuffer += '"'; - outputBuffer += ':'; - if(this->global_header.getSampleNumber() > 1) outputBuffer += '['; - objects.format_containers[targetKeys[0]]->to_json_string(outputBuffer, position, 0); - // Cycle over samples - for(U64 s = 1; s < this->global_header.getSampleNumber(); ++s){ - outputBuffer += ','; - objects.format_containers[targetKeys[0]]->to_json_string(outputBuffer, position, s); - } - if(this->global_header.getSampleNumber() > 1) outputBuffer += ']'; - - for(U32 i = 1; i < targetKeys.size(); ++i){ - outputBuffer += ','; - outputBuffer += "\"FORMAT-"; - outputBuffer += objects.format_field_names[targetKeys[i]]; - outputBuffer += '"'; - outputBuffer += ':'; - if(this->global_header.getSampleNumber() > 1) outputBuffer += '['; - objects.format_containers[targetKeys[i]]->to_json_string(outputBuffer, position, 0); - // Cycle over samples - for(U64 s = 1; s < this->global_header.getSampleNumber(); ++s){ - outputBuffer += ','; - objects.format_containers[targetKeys[i]]->to_json_string(outputBuffer, position, s); - } - if(this->global_header.getSampleNumber() > 1) outputBuffer += ']'; +U64 VariantReader::OutputHtslibVcfLinear(void){ + this->variant_container.AllocateGenotypeMemory(); + + // Open a htslib file handle for the target output + // destination. + char hts_stream_type[2]; + hts_stream_type[0] = 'w'; hts_stream_type[1] = this->settings.output_type; + htsFile *fp = hts_open(this->settings.output.c_str(), hts_stream_type); + + // Convert the internal yon header to a bcf_hdr_t + // structure. 
+ bcf_hdr_t* hdr = this->GetGlobalHeader().ConvertVcfHeader(!this->settings.drop_format); + if ( bcf_hdr_write(fp, hdr) != 0 ) { + std::cerr << "Failed to write header to " << this->settings.output << std::endl; + exit(1); + } + + // Initialize an empty record that we will keep + // reusing as we iterate over available yon records. + bcf1_t *rec = bcf_init1(); + + // Iterate over available blocks. + while(this->NextBlock()){ + // Lazy evaluate yon records. + objects_type* objects = this->GetCurrentContainer().LoadObjects(this->block_settings); + yon1_t* entries = this->GetCurrentContainer().LazyEvaluate(*objects); + + // Iterate over available records in this block. + for(U32 i = 0; i < objects->meta_container->size(); ++i){ + if(this->variant_filters.filter(entries[i], i) == false) + continue; + + entries[i].meta->UpdateHtslibVcfRecord(rec, hdr); + this->OutputHtslibVcfInfo(rec, hdr, entries[i]); + this->OutputHtslibVcfFormat(rec, hdr, entries[i]); + this->OutputHtslibVcfFilter(rec, hdr, entries[i]); + + if ( bcf_write1(fp, hdr, rec) != 0 ){ + std::cerr << "Failed to write record to " << this->settings.output; + exit(1); } + + bcf_clear1(rec); } - } -} -void VariantReader::printINFOVCF(buffer_type& outputBuffer, - const char& delimiter, - const U32& position, - const objects_type& objects) const -{ - // Check if any INFO data exists at all - if(this->variant_container.getBlock().footer.n_info_patterns == 0){ - return; + // Cleanup lazy evaluation of yon records. 
+ delete [] entries; + delete objects; } - if(block_settings.info_all.display || objects.n_loaded_info){ - const std::vector& targetKeys = objects.local_match_keychain_info[objects.meta_container->at(position).info_pattern_id]; - if(targetKeys.size()){ - if(outputBuffer.back() != delimiter) outputBuffer += delimiter; - - // First - outputBuffer += objects.info_field_names[targetKeys[0]]; - if(objects.info_containers[targetKeys[0]]->emptyPosition(position) == false){ - outputBuffer += '='; - objects.info_containers[targetKeys[0]]->to_vcf_string(outputBuffer, position); - } + // Cleanup htslib bcf1_t and bcf_hdr_t structures. + bcf_destroy1(rec); + bcf_hdr_destroy(hdr); - for(U32 i = 1; i < targetKeys.size(); ++i){ - outputBuffer += ";"; - outputBuffer += objects.info_field_names[targetKeys[i]]; - if(this->global_header.info_fields[this->variant_container.getBlock().footer.info_offsets[targetKeys[i]].data_header.global_key].primitive_type == YON_VCF_HEADER_FLAG){ - continue; - } - if(objects.info_containers[targetKeys[i]]->emptyPosition(position)) continue; - outputBuffer += '='; - objects.info_containers[targetKeys[i]]->to_vcf_string(outputBuffer, position); - } - } else { - if(outputBuffer.back() != delimiter) outputBuffer += delimiter; - outputBuffer += '.'; - } + // Close file handle. 
+ int ret; + if ( (ret=hts_close(fp)) ) { + fprintf(stderr,"hts_close(%s): non-zero status %d\n",this->settings.output.data(),ret); + exit(ret); } -} -void VariantReader::printINFOVCF(buffer_type& outputBuffer, - const U32& position, - const objects_type& objects) const -{ - return(this->printINFOVCF(outputBuffer, '\t', position, objects)); + return 0; } -void VariantReader::printINFOCustom(buffer_type& outputBuffer, - const char& delimiter, - const U32& position, - const objects_type& objects) const -{ - if(block_settings.info_all.display || objects.n_loaded_info){ - const std::vector& targetKeys = objects.local_match_keychain_info[objects.meta_container->at(position).info_pattern_id]; - if(targetKeys.size()){ - if(outputBuffer.back() != delimiter) outputBuffer += delimiter; - - // Check if this target container is a FLAG - if(this->global_header.info_fields[this->variant_container.getBlock().info_containers[targetKeys[0]].header.getGlobalKey()].primitive_type == YON_VCF_HEADER_FLAG){ - outputBuffer += objects.info_field_names[targetKeys[0]]; - } else { - // Check if the positon is empty - if(objects.info_containers[targetKeys[0]]->emptyPosition(position) == false){ - objects.info_containers[targetKeys[0]]->to_vcf_string(outputBuffer, position); - } - } +U64 VariantReader::OutputHtslibVcfSearch(void){ + this->variant_container.AllocateGenotypeMemory(); - for(U32 i = 1; i < targetKeys.size(); ++i){ - outputBuffer += delimiter; - if(this->global_header.info_fields[this->variant_container.getBlock().info_containers[targetKeys[i]].header.getGlobalKey()].primitive_type == YON_VCF_HEADER_FLAG){ - outputBuffer +=objects.info_field_names[targetKeys[i]]; - continue; - } - if(objects.info_containers[targetKeys[i]]->emptyPosition(position)) continue; - objects.info_containers[targetKeys[i]]->to_vcf_string(outputBuffer, position); - } - } - } -} - -void VariantReader::printINFOCustomJSON(buffer_type& outputBuffer, - const char& delimiter, - const U32& position, - const 
objects_type& objects) const -{ - if(block_settings.info_all.display || objects.n_loaded_info){ - const std::vector& targetKeys = objects.local_match_keychain_info[objects.meta_container->at(position).info_pattern_id]; - if(targetKeys.size()){ - if(outputBuffer.back() != ',') outputBuffer += ','; - // Check if this target container is a FLAG - outputBuffer += "\"INFO-"; - outputBuffer += objects.info_field_names[targetKeys[0]]; - outputBuffer += "\":"; - if(this->global_header.info_fields[this->variant_container.getBlock().info_containers[targetKeys[0]].header.getGlobalKey()].primitive_type == YON_VCF_HEADER_FLAG){ - outputBuffer += "true"; - } else { - objects.info_containers[targetKeys[0]]->to_json_string(outputBuffer, position); - } + // Open a htslib file handle for the target output + // destination. + char hts_stream_type[2]; + hts_stream_type[0] = 'w'; hts_stream_type[1] = this->settings.output_type; + htsFile *fp = hts_open(this->settings.output.c_str(), hts_stream_type); - for(U32 i = 1; i < targetKeys.size(); ++i){ - outputBuffer += ','; - outputBuffer += "\"INFO-"; - outputBuffer += objects.info_field_names[targetKeys[i]]; - outputBuffer += "\":"; - if(this->global_header.info_fields[this->variant_container.getBlock().info_containers[targetKeys[i]].header.getGlobalKey()].primitive_type == YON_VCF_HEADER_FLAG){ - outputBuffer += "true"; - continue; - } - objects.info_containers[targetKeys[i]]->to_json_string(outputBuffer, position); - } - } + // Convert the internal yon header to a bcf_hdr_t + // structure. 
+ bcf_hdr_t* hdr = this->GetGlobalHeader().ConvertVcfHeader(!this->settings.drop_format); + if ( bcf_hdr_write(fp, hdr) != 0 ) { + std::cerr << "Failed to write header to " << this->settings.output << std::endl; + exit(1); } -} -TACHYON_VARIANT_CLASSIFICATION_TYPE VariantReader::classifyVariant(const meta_entry_type& meta, const U32& allele) const{ - const S32 ref_size = meta.alleles[0].size(); - const S32 diff = ref_size - meta.alleles[allele].size(); - //std::cerr << diff << ","; - if(meta.alleles[0].allele[0] == '<' || meta.alleles[allele].allele[0] == '<') return(YON_VARIANT_CLASS_SV); - else if(diff == 0){ - if(ref_size == 1 && meta.alleles[0].allele[0] != meta.alleles[allele].allele[0]){ - if(meta.alleles[allele].allele[0] == 'A' || meta.alleles[allele].allele[0] == 'T' || meta.alleles[allele].allele[0] == 'G' || meta.alleles[allele].allele[0] == 'C') - return(YON_VARIANT_CLASS_SNP); - else return(YON_VARIANT_CLASS_UNKNOWN); - } - else if(ref_size != 1){ - U32 characters_identical = 0; - const U32 length_shortest = ref_size < meta.alleles[allele].size() ? ref_size : meta.alleles[allele].size(); + // Initialize an empty record that we will keep + // reusing as we iterate over available yon records. + bcf1_t *rec = bcf_init1(); - for(U32 c = 0; c < length_shortest; ++c){ - characters_identical += (meta.alleles[0].allele[c] == meta.alleles[allele].allele[c]); + // Filter functionality + filter_intervals_function filter_intervals = &self_type::FilterIntervals; + + // Iterate over available blocks. + while(this->NextBlock()){ + // Lazy evaluate yon records. + objects_type* objects = this->GetCurrentContainer().LoadObjects(this->block_settings); + yon1_t* entries = this->GetCurrentContainer().LazyEvaluate(*objects); + + // Iterate over available records in this block. 
+ for(U32 i = 0; i < objects->meta_container->size(); ++i){ + if((this->*filter_intervals)(objects->meta_container->at(i)) == false) + continue; + + if(this->variant_filters.filter(entries[i], i) == false) + continue; + + entries[i].meta->UpdateHtslibVcfRecord(rec, hdr); + this->OutputHtslibVcfInfo(rec, hdr, entries[i]); + this->OutputHtslibVcfFormat(rec, hdr, entries[i]); + this->OutputHtslibVcfFilter(rec, hdr, entries[i]); + + if ( bcf_write1(fp, hdr, rec) != 0 ){ + std::cerr << "Failed to write record to " << this->settings.output; + exit(1); } - if(characters_identical == 0) return(YON_VARIANT_CLASS_MNP); - else return(YON_VARIANT_CLASS_CLUMPED); + bcf_clear1(rec); } - } else { - const U32 length_shortest = ref_size < meta.alleles[allele].size() ? ref_size : meta.alleles[allele].size(); - U32 characters_non_standard = 0; - for(U32 c = 0; c < length_shortest; ++c){ - characters_non_standard += (meta.alleles[allele].allele[c] != 'A' && meta.alleles[allele].allele[c] != 'T' && meta.alleles[allele].allele[c] != 'C' && meta.alleles[allele].allele[c] !='G'); - } - if(characters_non_standard) return(YON_VARIANT_CLASS_UNKNOWN); - else return(YON_VARIANT_CLASS_INDEL); - } - return(YON_VARIANT_CLASS_UNKNOWN); -} -/**< - * Outputs - * @return - */ -const U64 VariantReader::outputVCF(void){ - U64 n_variants = 0; - - if(this->block_settings.annotate_extra){ - // fixme - // if special - // "FS_A", "AN", "NM", "NPM", "AC", "AC_FW", "AC_REV", "AF", "HWE_P", "VT", "MULTI_ALLELIC", "F_PIC" - if(this->global_header.getInfoField("FS_A") == nullptr) this->global_header.literals += "\n##INFO="; - if(this->global_header.getInfoField("AN") == nullptr) this->global_header.literals += "\n##INFO="; - if(this->global_header.getInfoField("NM") == nullptr) this->global_header.literals += "\n##INFO="; - if(this->global_header.getInfoField("NPM") == nullptr) this->global_header.literals += "\n##INFO="; - if(this->global_header.getInfoField("AC") == nullptr) this->global_header.literals += 
"\n##INFO="; - if(this->global_header.getInfoField("AC_FWD") == nullptr) this->global_header.literals += "\n##INFO="; - if(this->global_header.getInfoField("AC_REV") == nullptr) this->global_header.literals += "\n##INFO="; - if(this->global_header.getInfoField("HWE_P") == nullptr) this->global_header.literals += "\n##INFO="; - if(this->global_header.getInfoField("VT") == nullptr) this->global_header.literals += "\n##INFO="; - if(this->global_header.getInfoField("AF") == nullptr) this->global_header.literals += "\n##INFO="; - if(this->global_header.getInfoField("MULTI_ALLELIC") == nullptr) this->global_header.literals += "\n##INFO="; - if(this->global_header.getInfoField("F_PIC") == nullptr) this->global_header.literals += "\n##INFO="; + // Cleanup lazy evaluation of yon records. + delete [] entries; + delete objects; } - this->global_header.literals += "\n##tachyon_viewVersion=" + tachyon::constants::PROGRAM_NAME + "-" + VERSION + ";"; - this->global_header.literals += "libraries=" + tachyon::constants::PROGRAM_NAME + '-' + tachyon::constants::TACHYON_LIB_VERSION + "," - + SSLeay_version(SSLEAY_VERSION) + "," + "ZSTD-" + ZSTD_versionString() + "; timestamp=" + utility::datetime(); + // Cleanup htslib bcf1_t and bcf_hdr_t structures. + bcf_destroy1(rec); + bcf_hdr_destroy(hdr); - this->global_header.literals += "\n##tachyon_viewCommand=" + tachyon::constants::LITERAL_COMMAND_LINE + '\n'; - this->global_header.literals += this->getSettings().get_settings_string(); - - // Output VCF header - if(this->block_settings.show_vcf_header){ - this->global_header.writeHeaderVCF(std::cout, this->block_settings.format_all.load || this->block_settings.format_list.size()); + // Close file handle. 
+ int ret; + if ( (ret=hts_close(fp)) ) { + fprintf(stderr,"hts_close(%s): non-zero status %d\n",this->settings.output.data(),ret); + exit(ret); } - // If seek is active for targetted intervals - if(this->interval_container.hasIntervals()){ - if(this->interval_container.build(this->global_header) == false) - return false; + return 0; +} - if(this->interval_container.getBlockList().size()){ - for(U32 i = 0; i < this->interval_container.getBlockList().size(); ++i){ - if(this->getBlock(this->interval_container.getBlockList()[i]) == false){ - return(0); +void VariantReader::OutputHtslibVcfInfo(bcf1_t* rec, bcf_hdr_t* hdr, yon1_t& entry) const{ + if(entry.n_info){ + const uint32_t n_info_avail = entry.info_ids->size(); + if(n_info_avail){ + for(U32 j = 0; j < n_info_avail; ++j){ + if(entry.info_hdr[j]->yon_type == YON_VCF_HEADER_FLAG){ + bcf_update_info_flag(hdr, rec, entry.info_hdr[j]->id.data(), NULL, 1); + } else { + entry.info[j]->UpdateHtslibVcfRecordInfo(rec, hdr, entry.info_hdr[j]->id); } - n_variants += this->outputBlockVCF(); } - return(n_variants); - } else { // Provided intervals but no matching YON blocks - return(0); + if(this->GetBlockSettings().annotate_extra){ + entry.EvaluateSummary(true); + entry.gt_sum->d->UpdateHtslibVcfRecord(rec, hdr); + } } - } - - // While there are YON blocks - while(this->nextBlock()) n_variants += this->outputBlockVCF(); - return(n_variants); -} - -/**< - * - * @return - */ -const U64 VariantReader::outputCustom(void){ - U64 n_variants = 0; - - // While there are YON blocks - while(this->nextBlock()) n_variants += this->outputBlockCustom(); - return(n_variants); -} - -/**< - * - * @return - */ -const U32 VariantReader::outputBlockVCF(void){ - objects_type objects; - this->loadObjects(objects); - - // Reserve memory for output buffer - // This is much faster than writing directly to ostream because of synchronisation - io::BasicBuffer output_buffer(256000); - if(this->block_settings.format_all.load) output_buffer.resize(256000 
+ this->global_header.getSampleNumber()*2); - - // Todo: in cases of non-diploid - std::vector genotypes_unpermuted(this->global_header.getSampleNumber()); - for(U32 i = 0; i < this->global_header.getSampleNumber(); ++i){ - genotypes_unpermuted[i].alleles = new core::GTObjectAllele; - } - - // Print functionality - print_format_function print_format = &self_type::printFORMATDummy; - if(this->block_settings.format_ID_list.size()) print_format = &self_type::printFORMATCustom; - else if(block_settings.format_all.display) print_format = &self_type::printFORMATVCF; - print_info_function print_info = &self_type::printINFOVCF; - print_meta_function print_meta = &utility::to_vcf_string; - print_filter_function print_filter = &self_type::printFILTER; - - // Filter functionality - filter_intervals_function filter_intervals = &self_type::filterIntervalsDummy; - if(this->interval_container.size()) filter_intervals = &self_type::filterIntervals; - - // Cycling over loaded meta objects - for(U32 p = 0; p < objects.meta_container->size(); ++p){ - if(this->variant_filters.filter(objects, p) == false) - continue; - - if((this->*filter_intervals)(objects.meta_container->at(p)) == false) - continue; - - - if(this->block_settings.custom_output_format) - utility::to_vcf_string(output_buffer, '\t', objects.meta_container->at(p), this->global_header, this->block_settings); - else - utility::to_vcf_string(output_buffer, '\t', objects.meta_container->at(p), this->global_header); - - // Filter options - (this->*print_filter)(output_buffer, p, objects); - (this->*print_info)(output_buffer, '\t', p, objects); - if(this->block_settings.annotate_extra) this->getGenotypeSummary(output_buffer, p, objects); // Todo: fixme - (this->*print_format)(output_buffer, '\t', p, objects, genotypes_unpermuted); - output_buffer += '\n'; - - if(output_buffer.size() > 65536){ - std::cout.write(output_buffer.data(), output_buffer.size()); - output_buffer.reset(); - std::cout.flush(); + } else { + 
if(this->GetBlockSettings().annotate_extra){ + entry.EvaluateSummary(true); + entry.gt_sum->d->UpdateHtslibVcfRecord(rec, hdr); } } - - std::cout.write(output_buffer.data(), output_buffer.size()); - output_buffer.reset(); - std::cout.flush(); - - return(objects.meta_container->size()); } -/**< - * - * @return - */ -const U32 VariantReader::outputBlockCustom(void){ - objects_type objects; - this->loadObjects(objects); - - // Reserve memory for output buffer - // This is much faster than writing directly to ostream because of syncing - io::BasicBuffer output_buffer(256000 + this->global_header.getSampleNumber()*2); - std::vector genotypes_unpermuted(this->global_header.getSampleNumber()); - - // Todo: move to function - //U32 info_match_limit = 1; // any match - //info_match_limit = this->block_settings.info_list.size(); // all match - - // Function pointer to use - print_format_function print_format = &self_type::printFORMATDummy; - print_info_function print_info = &self_type::printINFODummy; - print_meta_function print_meta = &utility::to_vcf_string; - print_filter_function print_filter = &self_type::printFILTERDummy; - if(block_settings.output_json) print_meta = &utility::to_json_string; - - if(block_settings.format_all.display || objects.n_loaded_format){ - if(block_settings.output_json){ - print_format = &self_type::printFORMATCustomVectorJSON; - } else { - if(block_settings.output_format_vector) print_format = &self_type::printFORMATCustomVector; - else print_format = &self_type::printFORMATCustom; +void VariantReader::OutputHtslibVcfFormat(bcf1_t* rec, bcf_hdr_t* hdr, const yon1_t& entry) const{ + if(entry.n_format){ + const uint32_t n_format_avail = entry.format_ids->size(); + if(n_format_avail){ + // Case when the only available FORMAT field is the GT field. 
+ if(n_format_avail == 1 && entry.is_loaded_gt && + entry.meta->controller.gt_available && + (this->GetBlockSettings().display_static & YON_BLK_BV_GT)) + { + entry.gt->ExpandExternal(this->variant_container.GetAllocatedGenotypeMemory()); + entry.gt->d_exp = this->variant_container.GetAllocatedGenotypeMemory(); + entry.gt->UpdateHtslibGenotypes(rec, hdr); + entry.gt->d_exp = nullptr; + } + // Case when there are > 1 Vcf Format fields and the GT field + // is available. + else if(n_format_avail > 1 && entry.is_loaded_gt && + entry.meta->controller.gt_available && + (this->GetBlockSettings().display_static & YON_BLK_BV_GT)) + { + entry.gt->ExpandExternal(this->variant_container.GetAllocatedGenotypeMemory()); + entry.gt->d_exp = this->variant_container.GetAllocatedGenotypeMemory(); + + entry.gt->UpdateHtslibGenotypes(rec, hdr); + for(U32 g = 1; g < n_format_avail; ++g) + entry.format_containers[g]->UpdateHtslibVcfRecord(entry.id_block, rec, hdr, entry.format_hdr[g]->id); + + entry.gt->d_exp = nullptr; + } + // All other cases. 
+ else { + for(U32 g = 0; g < n_format_avail; ++g) + entry.format_containers[g]->UpdateHtslibVcfRecord(entry.id_block, rec, hdr, entry.format_hdr[g]->id); + } } } +} - if(block_settings.info_all.display || objects.n_loaded_info){ - if(block_settings.output_json) print_info = &self_type::printINFOCustomJSON; - else print_info = &self_type::printINFOCustom; - } - - if(block_settings.display_filter){ - if(block_settings.output_json) print_filter = &self_type::printFILTERJSON; - else print_filter = &self_type::printFILTERCustom; - } - - U32 n_records_returned = 0; - - // Filter functionality - filter_intervals_function filter_intervals = &self_type::filterIntervalsDummy; - if(this->interval_container.size()) filter_intervals = &self_type::filterIntervals; - - - if(block_settings.output_json) output_buffer += "\"block\":["; - for(U32 position = 0; position < objects.meta_container->size(); ++position){ - if(this->variant_filters.filter(objects, position) == false) - continue; - - if((this->*filter_intervals)(objects.meta_container->at(position)) == false) - continue; - - //if(info_keep[objects.meta->at(p).getInfoPatternID()] < info_match_limit) - // continue; - - if(block_settings.output_json){ - if(position != 0) output_buffer += ",\n"; - output_buffer += "{"; - } - ++n_records_returned; - - (*print_meta)(output_buffer, this->block_settings.custom_delimiter_char, objects.meta_container->at(position), this->global_header, this->block_settings); - (this->*print_filter)(output_buffer, position, objects); - (this->*print_info)(output_buffer, this->block_settings.custom_delimiter_char, position, objects); - (this->*print_format)(output_buffer, this->block_settings.custom_delimiter_char, position, objects, genotypes_unpermuted); - - if(block_settings.output_json) output_buffer += "}"; - else output_buffer += '\n'; - //output_buffer += "}"; - - // Flush if buffer is large - if(output_buffer.size() > 65536){ - std::cout.write(output_buffer.data(), output_buffer.size()); - 
output_buffer.reset(); - std::cout.flush(); +void VariantReader::OutputHtslibVcfFilter(bcf1_t* rec, bcf_hdr_t* hdr, const yon1_t& entry) const{ + if(entry.n_filter){ + for(U32 k = 0; k < entry.filter_ids->size(); ++k){ + int32_t tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, entry.filter_hdr[k]->id.data()); + bcf_update_filter(hdr, rec, &tmpi, 1); } } - if(block_settings.output_json) output_buffer += "]"; - - // Flush buffer - std::cout.write(output_buffer.data(), output_buffer.size()); - output_buffer.reset(); - std::cout.flush(); - - return(n_records_returned); } } diff --git a/lib/variant_reader.h b/lib/variant_reader.h index 0ef035a..d78e0cb 100644 --- a/lib/variant_reader.h +++ b/lib/variant_reader.h @@ -22,7 +22,6 @@ #include "containers/variant_block.h" #include "containers/variant_block_container.h" #include "core/footer/footer.h" -#include "core/genotype_object.h" #include "core/header/variant_header.h" #include "core/variant_reader_filters.h" #include "core/variant_reader_objects.h" @@ -33,6 +32,7 @@ #include "math/fisher_math.h" #include "math/square_matrix.h" #include "utility/support_vcf.h" +#include "io/basic_reader.h" namespace tachyon{ @@ -40,8 +40,9 @@ class VariantReader{ private: typedef VariantReader self_type; typedef io::BasicBuffer buffer_type; - typedef core::VariantHeader header_type; + typedef VariantHeader header_type; typedef core::Footer footer_type; + typedef core::MetaEntry meta_entry_type; typedef algorithm::CompressionManager codec_manager_type; typedef DataBlockSettings block_settings_type; typedef VariantReaderSettings settings_type; @@ -49,27 +50,22 @@ class VariantReader{ typedef index::IndexEntry index_entry_type; typedef algorithm::VariantDigestManager checksum_type; typedef encryption::Keychain<> keychain_type; - typedef core::MetaEntry meta_entry_type; typedef VariantReaderObjects objects_type; typedef containers::VariantBlock block_entry_type; typedef containers::MetaContainer meta_container_type; typedef 
containers::GenotypeContainer gt_container_type; typedef containers::InfoContainerInterface info_interface_type; typedef containers::FormatContainerInterface format_interface_type; - typedef containers::GenotypeSummary genotype_summary_type; typedef containers::IntervalContainer interval_container_type; typedef containers::VariantBlockContainer variant_container_type; typedef VariantReaderFilters variant_filter_type; typedef algorithm::Interval interval_type; + typedef io::BasicReader basic_reader_type; + typedef encryption::EncryptionDecorator encryption_manager_type; - // Function pointers - typedef void (self_type::*print_format_function)(buffer_type& buffer, const char& delimiter, const U32& position, const objects_type& objects, std::vector& genotypes_unpermuted) const; - typedef void (self_type::*print_info_function)(buffer_type& outputBuffer, const char& delimiter, const U32& position, const objects_type& objects) const; - typedef void (self_type::*print_filter_function)(buffer_type& outputBuffer, const U32& position, const objects_type& objects) const; + // Function pointer to interval slicing. typedef bool (self_type::*filter_intervals_function)(const meta_entry_type& meta_entry) const; - typedef buffer_type& (*print_meta_function)(buffer_type& buffer, const char& delimiter, const meta_entry_type& meta_entry, const header_type& header, const block_settings_type& controller); - public: VariantReader(); VariantReader(const std::string& filename); @@ -83,7 +79,8 @@ class VariantReader{ * has been invoked has no effect on the loaded `block` data. * @return A reference instance of the block settings object */ - inline block_settings_type& getBlockSettings(void){ return(this->block_settings); } + inline block_settings_type& GetBlockSettings(void){ return(this->block_settings); } + inline const block_settings_type& GetBlockSettings(void) const{ return(this->block_settings); } /**< * Retrieve current settings for the variant reader. 
This settings @@ -92,56 +89,27 @@ class VariantReader{ * parsing. * @return A reference instance of the settings object */ - inline settings_type& getSettings(void){ return(this->settings); } + inline settings_type& GetSettings(void){ return(this->settings); } + inline const settings_type& GetSettings(void) const{ return(this->settings); } + /**< * Retrieve the current filter settings for the variant reader. This * object controls the pointers to filter applied to each variant. * @return A reference instance of the filter object */ - inline variant_filter_type& getFilterSettings(void){ return(this->variant_filters); } + inline variant_filter_type& GetFilterSettings(void){ return(this->variant_filters); } // Basic accessors - inline header_type& getGlobalHeader(void){ return(this->global_header); } - inline const header_type& getGlobalHeader(void) const{ return(this->global_header); } - inline footer_type& getGlobalFooter(void){ return(this->global_footer); } - inline const footer_type& getGlobalFooter(void) const{ return(this->global_footer); } - inline index_type& getIndex(void){ return(this->index); } - inline const index_type& getIndex(void) const{ return(this->index); } - inline const size_t getFilesize(void) const{ return(this->filesize); } - inline variant_container_type& getCurrentBlock(void){ return(this->variant_container); } - inline const variant_container_type& getCurrentBlock(void) const{ return(this->variant_container); } - - /**< - * Checks if a FORMAT `field` is set in the header and then checks - * if that field exists in the current block. If it does return - * the GLOBAL key. If the field is not described in the header at - * all then return -2. - * @param field_name FORMAT field name to search for (e.g. "GL") - * @return Returns local key if found in this block. 
Returns -2 if not found in header, or -1 if found in header but not in block - */ - const int has_format_field(const std::string& field_name) const; - - /**< - * Checks if a INFO `field` is set in the header and then checks - * if that field exists in the current block. If it does return - * the GLOBAL key. If the field is not described in the header at - * all then return -2. - * @param field_name INFO field name to search for (e.g. "AC") - * @return Returns local key if found in this block. Returns -2 if not found in header, or -1 if found in header but not in block - */ - const int has_info_field(const std::string& field_name) const; - - /**< - * Checks if a FILTER `field` is set in the header and then checks - * if that field exists in the current block. If it does return - * the GLOBAL key. If the field is not described in the header at - * all then return -2. - * @param field_name FILTER field name to search for (e.g. "PASS") - * @return Returns local key if found in this block. Returns -2 if not found in header, or -1 if found in header but not in block - */ - const int has_filter_field(const std::string& field_name) const; - + inline header_type& GetGlobalHeader(void){ return(this->global_header); } + inline const header_type& GetGlobalHeader(void) const{ return(this->global_header); } + inline footer_type& GetGlobalFooter(void){ return(this->global_footer); } + inline const footer_type& GetGlobalFooter(void) const{ return(this->global_footer); } + inline index_type& GetIndex(void){ return(this->index); } + inline const index_type& GetIndex(void) const{ return(this->index); } + inline size_t GetFilesize(void) const{ return(this->basic_reader.filesize_); } + inline variant_container_type& GetCurrentContainer(void){ return(this->variant_container); } + inline const variant_container_type& GetCurrentContainer(void) const{ return(this->variant_container); } /**< * Opens a YON file. 
Performs all prerequisite @@ -157,7 +125,12 @@ class VariantReader{ * @return Returns TRUE upon success or FALSE otherwise */ inline bool open(const std::string& filename){ + this->basic_reader.filename_ = filename; this->settings.input = filename; + if(settings.keychain_file.size()){ + if(this->LoadKeychainFile() == false) + return false; + } return(this->open()); } @@ -175,14 +148,14 @@ class VariantReader{ * @param position * @return */ - bool seektoBlock(const U32 position); + bool SeektoBlock(const U32 position); /**< * Not implemented * @param chromosome_name * @return */ - bool seekToBlockChromosome(const std::string& chromosome_name); + bool SeekToBlockChromosome(const std::string& chromosome_name); /**< * Not implemented @@ -191,25 +164,19 @@ class VariantReader{ * @param to_bp_position * @return */ - bool seekToBlockChromosome(const std::string& chromosome_name, const U32 from_bp_position, const U32 to_bp_position); + bool SeekToBlockChromosome(const std::string& chromosome_name, const U32 from_bp_position, const U32 to_bp_position); /**< * Get the next YON block in-order * @return Returns TRUE if successful or FALSE otherwise */ - bool nextBlock(void); + bool NextBlock(void); /**< * Get the target YON block * @return Returns TRUE if successful or FALSE otherwise */ - bool getBlock(const index_entry_type& index_entry); - - /**< - * Get the current YON block in-order as a copy - * @return Returns a YON block container. 
The container has a size of 0 upon fail/empty - */ - variant_container_type getBlock(void); + bool GetBlock(const index_entry_type& index_entry); /**< @@ -220,78 +187,28 @@ class VariantReader{ * @param blockID * @return */ - bool seek_to_block(const U32& blockID); - - /**< - * Primary construction function for generating the appropriate instances of - * iterators / containers - * @param objects Target objects - * @return Returns reference to input target objects - */ - objects_type& loadObjects(objects_type& objects) const; - - /**< - * Wrapper function to call internal functions `outputCustom` or `outputBlockVCF`. - * Decides internally what function to invoke. - * @return - */ - const U64 outputVCF(void); - - /**< - * - * @return - */ - const U64 outputCustom(void); - - /**< - * - * @return - */ - const U32 outputBlockVCF(void); - - /**< - * - * @return - */ - const U32 outputBlockCustom(void); - - // Dummy functions as interfaces for function pointers - inline void printFILTERDummy(buffer_type& outputBuffer, const U32& position, const objects_type& objects) const{} - inline void printFORMATDummy(buffer_type& buffer, const char& delimiter, const U32& position, const objects_type& objects, std::vector& genotypes_unpermuted) const{} - inline void printINFODummy(buffer_type& outputBuffer, const char& delimiter, const U32& position, const objects_type& objects) const{} + bool SeekBlock(const U32& blockID); + + U64 OutputRecords(void); + U64 OutputVcfLinear(void); + U64 OutputVcfSearch(void); + void OuputVcfWrapper(io::BasicBuffer& output_buffer, yon1_t& entry) const; + void OutputInfoVcf(io::BasicBuffer& output_buffer, yon1_t& entry) const; + void OutputFormatVcf(io::BasicBuffer& output_buffer, const yon1_t& entry) const; + void OutputFilterVcf(io::BasicBuffer& output_buffer, const yon1_t& entry) const; + + U64 OutputHtslibVcfLinear(void); + U64 OutputHtslibVcfSearch(void); + void OutputHtslibVcfInfo(bcf1_t* rec, bcf_hdr_t* hdr, yon1_t& entry) const; + void 
OutputHtslibVcfFormat(bcf1_t* rec, bcf_hdr_t* hdr, const yon1_t& entry) const; + void OutputHtslibVcfFilter(bcf1_t* rec, bcf_hdr_t* hdr, const yon1_t& entry) const; // Filter interval intersection and dummy version - inline bool filterIntervalsDummy(const meta_entry_type& meta_entry) const{ return true; } - inline bool filterIntervals(const meta_entry_type& meta_entry) const{ return(this->interval_container.find_overlaps(meta_entry).size()); } - - // FILTER functions - void printFILTER(buffer_type& outputBuffer, const U32& position, const objects_type& objects) const; - void printFILTERCustom(buffer_type& outputBuffer, const U32& position, const objects_type& objects) const; - void printFILTERJSON(buffer_type& outputBuffer, const U32& position, const objects_type& objects) const; - - // FORMAT functions - void printFORMATVCF(buffer_type& buffer, const U32& position, const objects_type& objects, std::vector& genotypes_unpermuted) const; - void printFORMATVCF(buffer_type& buffer, const char& delimiter, const U32& position, const objects_type& objects, std::vector& genotypes_unpermuted) const; - void printFORMATCustom(buffer_type& outputBuffer, const char& delimiter, const U32& position, const objects_type& objects, std::vector& genotypes_unpermuted) const; - void printFORMATCustomVector(buffer_type& outputBuffer, const char& delimiter, const U32& position, const objects_type& objects, std::vector& genotypes_unpermuted) const; - void printFORMATCustomVectorJSON(buffer_type& outputBuffer, const char& delimiter, const U32& position, const objects_type& objects, std::vector& genotypes_unpermuted) const; - - // INFO functions - void printINFOVCF(buffer_type& outputBuffer, const U32& position, const objects_type& objects) const; - void printINFOVCF(buffer_type& outputBuffer, const char& delimiter, const U32& position, const objects_type& objects) const; - void printINFOCustom(buffer_type& outputBuffer, const char& delimiter, const U32& position, const objects_type& 
objects) const; - void printINFOCustomJSON(buffer_type& outputBuffer, const char& delimiter, const U32& position, const objects_type& objects) const; + inline bool FilterIntervalsDummy(const meta_entry_type& meta_entry) const{ return true; } + inline bool FilterIntervals(const meta_entry_type& meta_entry) const{ return(this->interval_container.FindOverlaps(meta_entry).size()); } // Calculations - TACHYON_VARIANT_CLASSIFICATION_TYPE classifyVariant(const meta_entry_type& meta, const U32& allele) const; - - /**< - * Not implemented - * Return bit-mask primitive of variant classifications detected - * @param meta Input meta entry for a site - * @return Returns a primitive interpreted as a boolean presence/absence bit-mask - */ - BYTE classifyVariant(const meta_entry_type& meta) const; + TACHYON_VARIANT_CLASSIFICATION_TYPE ClassifyVariant(const meta_entry_type& meta, const U32& allele) const; /**< * Parse interval strings. These strings have to match the regular expression @@ -299,12 +216,17 @@ class VariantReader{ * YON_REGEX_CONTIG_ONLY, YON_REGEX_CONTIG_POSITION, or YON_REGEX_CONTIG_RANGE * @return Returns TRUE if successful or FALSE otherwise */ - inline const bool addIntervals(std::vector& interval_strings){ - return(this->interval_container.parseIntervals(interval_strings, this->global_header, this->index)); + inline bool AddIntervals(std::vector& interval_strings){ + return(this->interval_container.ParseIntervals(interval_strings, this->global_header, this->index)); } - bool loadKeychainFile(const std::string& path){ + /**< + * + * @param path + * @return + */ + bool LoadKeychainFile(void){ std::ifstream keychain_reader(settings.keychain_file, std::ios::binary | std::ios::in); if(!keychain_reader.good()){ std::cerr << tachyon::utility::timestamp("ERROR") << "Failed to open keychain: " << settings.keychain_file << "..." 
<< std::endl; @@ -319,318 +241,46 @@ class VariantReader{ return true; } - void printHeaderVCF(std::ostream& stream = std::cout){ - this->global_header.literals += "\n##tachyon_viewVersion=" + tachyon::constants::PROGRAM_NAME + "-" + VERSION + ";"; - this->global_header.literals += "libraries=" + tachyon::constants::PROGRAM_NAME + '-' + tachyon::constants::TACHYON_LIB_VERSION + "," + /**< + * + * @param stream + */ + void PrintHeaderVCF(std::ostream& stream = std::cout){ + this->global_header.literals_ += "##tachyon_viewVersion=" + tachyon::constants::PROGRAM_NAME + "-" + VERSION + ";"; + this->global_header.literals_ += "libraries=" + tachyon::constants::PROGRAM_NAME + '-' + tachyon::constants::TACHYON_LIB_VERSION + "," + SSLeay_version(SSLEAY_VERSION) + "," + "ZSTD-" + ZSTD_versionString() - + "; timestamp=" + tachyon::utility::datetime(); - - this->global_header.literals += "\n##tachyon_viewCommand=" + tachyon::constants::LITERAL_COMMAND_LINE + "\n"; - this->global_header.literals += this->getSettings().get_settings_string(); - - stream << this->global_header.literals << std::endl; - this->global_header.writeHeaderVCF(stream, true); - } - - //<----------------- EXAMPLE FUNCTIONS --------------------------> + + "; timestamp=" + tachyon::utility::datetime() + "\n"; + this->global_header.literals_ += "##tachyon_viewCommand=" + tachyon::constants::LITERAL_COMMAND_LINE + "\n"; + this->global_header.literals_ += this->GetSettings().get_settings_string(); + this->global_header.literals_ += '\n'; - U64 timings_meta(){ - containers::MetaContainer meta(this->variant_container.getBlock()); - buffer_type temp(meta.size() * 1000); - - for(U32 p = 0; p < meta.size(); ++p){ - utility::to_vcf_string(temp, this->block_settings.custom_delimiter_char, meta[p], this->global_header); - //utility::to_vcf_string(std::cout, meta[p], this->global_header); - temp += '\n'; - //if(temp.size() > 65536){ - // std::cout.write(temp.data(), temp.size()); - // temp.reset(); - //} - } - 
std::cout.write(temp.data(), temp.size()); - return(meta.size()); + this->global_header.PrintVcfHeader(stream); } - U64 iterate_genotypes(std::ostream& stream = std::cout){ - containers::MetaContainer meta(this->variant_container.getBlock()); - containers::GenotypeContainer gt(this->variant_container.getBlock(), meta); - - for(U32 i = 0; i < gt.size(); ++i){ - // All of these functions are in relative terms very expensive! - // Avoid using them unless you absolutely have to! - // Vector of literal genotype representations (lower level) - //std::vector objects = gt[i].getLiteralObjects(); - // Vector of genotype objects (high level permuted) - //std::vector objects_all = gt[i].getObjects(this->global_header.getSampleNumber()); - // Vector of genotype objects (high level unpermuted - original) - std::vector objects_true = gt[i].getObjects(this->global_header.getSampleNumber(), this->variant_container.getBlock().ppa_manager); - - std::cout << (int)objects_true[i].alleles[0].allele << (objects_true[i].alleles[1].phase ? '/' : '|') << (int)objects_true[i].alleles[1].allele; - for(U32 i = 1; i < objects_true.size(); ++i){ - std::cout << '\t' << (int)objects_true[i].alleles[0].allele << (objects_true[i].alleles[1].phase ? 
'/' : '|') << (int)objects_true[i].alleles[1].allele; - } - std::cout << std::endl; - } - return(gt.size()); - } - - U64 calculateIBS(math::SquareMatrix& square, math::SquareMatrix& square_temporary){ - algorithm::Timer timer; - timer.Start(); - - containers::MetaContainer meta(this->variant_container.getBlock()); - containers::GenotypeContainer gt(this->variant_container.getBlock(), meta); - for(U32 i = 0; i < gt.size(); ++i) - gt[i].comparePairwise(square_temporary); - - //square /= (U64)2*this->global_header.getSampleNumber()*gt.size(); - square.addUpperTriagonal(square_temporary, this->variant_container.getBlock().ppa_manager); - square_temporary.clear(); - - // 2 * (Upper triagonal + diagonal) * number of variants - const U64 updates = 2*((this->global_header.getSampleNumber()*this->global_header.getSampleNumber() - this->global_header.getSampleNumber())/2 + this->global_header.getSampleNumber()) * gt.size(); - std::cerr << utility::timestamp("DEBUG") << "Updates: " << utility::ToPrettyString(updates) << '\t' << timer.ElapsedString() << '\t' << utility::ToPrettyString((U64)((double)updates/timer.Elapsed().count())) << "/s" << std::endl; - return((U64)2*this->global_header.getSampleNumber()*gt.size()); - } - - U64 getTiTVRatios(std::ostream& stream, std::vector& global){ - containers::MetaContainer meta(this->variant_container.getBlock()); - containers::GenotypeContainer gt(this->variant_container.getBlock(), meta); - - std::vector objects(this->global_header.getSampleNumber()); - for(U32 i = 0; i < gt.size(); ++i) - gt[i].getTsTv(objects); - - for(U32 i = 0; i < objects.size(); ++i) - global[this->variant_container.getBlock().ppa_manager[i]] += objects[i]; - - return(gt.size()); - } - - std::vector calculateStrandBiasAlleles(const meta_entry_type& meta, const genotype_summary_type& genotype_summary, const bool phred_scale = true) const{ - std::vector strand_bias_p_values(meta.n_alleles); - double fisher_left_p, fisher_right_p, fisher_twosided_p; - - 
kt_fisher_exact( - genotype_summary.vectorA_[2], // A: Allele on forward strand - genotype_summary.vectorB_[2], // B: Allele on reverse strand - genotype_summary.alleleCountA() - (genotype_summary.vectorA_[2]), // C: Not allele on forward strand - genotype_summary.alleleCountB() - (genotype_summary.vectorB_[2]), // D: Not allele on reverse strand - &fisher_left_p, &fisher_right_p, &fisher_twosided_p); - - if(phred_scale) strand_bias_p_values[0] = std::abs(-10 * log10(fisher_twosided_p)); - else strand_bias_p_values[0] = fisher_twosided_p; - - // If n_alleles = 2 then they are identical because of symmetry - if(meta.n_alleles > 2){ - for(U32 p = 1; p < meta.n_alleles; ++p){ - kt_fisher_exact( - genotype_summary.vectorA_[2+p], // A: Allele on forward strand - genotype_summary.vectorB_[2+p], // B: Allele on reverse strand - genotype_summary.alleleCountA() - (genotype_summary.vectorA_[2+p]), // C: Not allele on forward strand - genotype_summary.alleleCountB() - (genotype_summary.vectorB_[2+p]), // D: Not allele on reverse strand - &fisher_left_p, &fisher_right_p, &fisher_twosided_p); - - if(phred_scale) strand_bias_p_values[p] = std::abs(-10 * log10(fisher_twosided_p)); - else strand_bias_p_values[p] = fisher_twosided_p; - } - } - return(strand_bias_p_values); - } - - void getGenotypeSummary(buffer_type& buffer, const U32& position, objects_type& objects) const{ - if(this->block_settings.alleles.load == false || this->block_settings.genotypes_all.load == false || this->block_settings.controller.load == false || this->block_settings.set_membership.load == false){ - std::cerr << utility::timestamp("ERROR") << "Cannot run function without loading: SET-MEMBERSHIP, GT, REF or ALT, CONTIG or POSITION..." 
<< std::endl; - return; - } - - if(buffer.back() != ';' && buffer.back() != '\t') buffer += ';'; - - //objects_type objects; - //this->loadObjects(objects); - //U32 n_variants_parsed = 0; - - //for(U32 i = 0; i < objects.genotypes->size(); ++i){ - if(objects.meta_container->at(position).isDiploid() == false){ - std::cerr << "is not diploid" << std::endl; - return; - } - - // If set membership is -1 then calculate all fields - // Set target FLAG set to all ones; update with actual values if they exist - U16 target_flag_set = 65535; - if(objects.meta_container->at(position).getInfoPatternID() != -1) - target_flag_set = objects.additional_info_execute_flag_set[objects.meta_container->at(position).getInfoPatternID()]; - - // Get genotype summary data - objects.genotype_container->at(position).getSummary(*objects.genotype_summary); - std::vector hwe_p = objects.genotype_summary->calculateHardyWeinberg(objects.meta_container->at(position)); - std::vector af = objects.genotype_summary->calculateAlleleFrequency(objects.meta_container->at(position)); - - - //utility::to_vcf_string(stream, this->block_settings.custom_delimiter_char, meta, this->global_header); - - if(target_flag_set & 1){ - std::vector allele_bias = this->calculateStrandBiasAlleles(objects.meta_container->at(position), *objects.genotype_summary, true); - buffer += "FS_A="; - buffer.AddReadble(allele_bias[0]); - for(U32 p = 1; p < allele_bias.size(); ++p){ - buffer += ','; - buffer.AddReadble(allele_bias[p]); - } - } - - if(target_flag_set & 2){ - buffer += ";AN="; - buffer.AddReadble(objects.genotype_summary->alleleCount()); - } - - if(target_flag_set & 4){ - if(objects.genotype_summary->vectorA_[1] + objects.genotype_summary->vectorB_[1]){ - buffer += ";NM="; - buffer.AddReadble(objects.genotype_summary->vectorA_[1] + objects.genotype_summary->vectorB_[1]); - } - } - - if(target_flag_set & 8){ - if(objects.genotype_summary->vectorA_[0] + objects.genotype_summary->vectorB_[0]){ - buffer += ";NPM="; - 
buffer.AddReadble(objects.genotype_summary->vectorA_[0] + objects.genotype_summary->vectorB_[0]); - } - } - - if(target_flag_set & 16){ - buffer += ";AC="; - buffer.AddReadble(objects.genotype_summary->vectorA_[2] + objects.genotype_summary->vectorB_[2]); - for(U32 p = 1; p < objects.meta_container->at(position).n_alleles; ++p){ - buffer += ","; - buffer.AddReadble(objects.genotype_summary->vectorA_[2+p] + objects.genotype_summary->vectorB_[2+p]); - } - } - - if(target_flag_set & 32){ - buffer += ";AC_FWD="; - buffer.AddReadble(objects.genotype_summary->vectorA_[2]); - for(U32 p = 1; p < objects.meta_container->at(position).n_alleles; ++p){ - buffer += ","; - buffer.AddReadble(objects.genotype_summary->vectorA_[2+p]); - } - } - - if(target_flag_set & 64){ - buffer += ";AC_REV="; - buffer.AddReadble(objects.genotype_summary->vectorB_[2]); - for(U32 p = 1; p < objects.meta_container->at(position).n_alleles; ++p){ - buffer += ","; - buffer.AddReadble(objects.genotype_summary->vectorB_[2+p]); - } - } - - if(target_flag_set & 128){ - buffer += ";AF="; - buffer.AddReadble(af[0]); - for(U32 p = 1; p < af.size(); ++p){ - buffer += ","; - buffer.AddReadble(af[p]); - } - } + //<----------------- EXAMPLE FUNCTIONS --------------------------> - if(target_flag_set & 256){ - buffer += ";HWE_P="; - buffer.AddReadble(hwe_p[0]); - for(U32 p = 1; p < hwe_p.size(); ++p){ - buffer += ","; - buffer.AddReadble(hwe_p[p]); - } - } + /* if(target_flag_set & 512){ // Classify buffer += ";VT="; - buffer += TACHYON_VARIANT_CLASSIFICATION_STRING[this->classifyVariant(objects.meta_container->at(position), 1)]; + buffer += TACHYON_VARIANT_CLASSIFICATION_STRING[this->ClassifyVariant(objects.meta_container->at(position), 1)]; for(U32 p = 2; p < objects.meta_container->at(position).n_alleles; ++p){ buffer += ','; - buffer += TACHYON_VARIANT_CLASSIFICATION_STRING[this->classifyVariant(objects.meta_container->at(position), p)]; + buffer += 
TACHYON_VARIANT_CLASSIFICATION_STRING[this->ClassifyVariant(objects.meta_container->at(position), p)]; } } + */ - if(target_flag_set & 1024){ - if(objects.meta_container->at(position).n_alleles > 2) buffer += ";MULTI_ALLELIC"; - } - - // Population inbreeding coefficient: F = (Hexp - Hobs) /Hexp - if(target_flag_set & 2048){ - // Allele frequency of A - const double p = ((double)2*objects.genotype_summary->matrix_[2][2] + objects.genotype_summary->matrix_[2][3] + objects.genotype_summary->matrix_[3][2]) / (2*objects.genotype_summary->genotypeCount()); - // Genotype frequency of heterozyotes - const double pg = ((double)objects.genotype_summary->matrix_[2][3] + objects.genotype_summary->matrix_[3][2]) / objects.genotype_summary->genotypeCount(); - // Expected heterozygosity - const double exp = 2*p*(1-p); - // Population inbreeding coefficient: F - const double f_pic = exp > 0 ? (exp-pg)/exp : 0; - buffer += ";F_PIC="; - buffer.AddReadble(f_pic); - } - - objects.genotype_summary->clear(); - } - - U64 countVariants(std::ostream& stream = std::cout){ - containers::MetaContainer meta(this->variant_container.getBlock()); - return(meta.size()); - } - - U64 iterateMeta(std::ostream& stream = std::cout){ - containers::MetaContainer meta(this->variant_container.getBlock()); - containers::GenotypeContainer gt(this->variant_container.getBlock(), meta); - containers::GenotypeSummary gt_summary; - for(U32 i = 0; i < gt.size(); ++i){ - // If there's > 5 alleles continue - if(gt[i].getMeta().getNumberAlleles() >= 5) continue; - // Calculate summary statistics - //gt[i].getSummary(gt_summary); - - // Calculate total number of alt-alleles (allele 1, where 0 is ref) - //std::cerr << gt_summary << '\n'; - gt_summary.clear(); // Recycle summary object - } - //std::cerr << std::endl; - //std::cerr << gt.size() << std::endl; - return(gt.size()); - //std::cerr << gt[0] << std::endl;; - - //return true; - - core::HeaderMapEntry* entry = nullptr; - if(this->global_header.getInfoField("AF", 
entry)){ - containers::InfoContainer it_i(this->variant_container.getBlock().info_containers[1]); - //math::MathSummaryStatistics stats = it_i.getSummaryStatistics(); - //std::cerr << stats.n_total << '\t' << stats.mean << '\t' << stats.standard_deviation << '\t' << stats.min << "-" << stats.max << std::endl; - for(U32 i = 0; i < it_i.size(); ++i){ - //if(it_i[i].size() < 3) continue; - //it[i].toVCFString(stream, this->global_header, this->variant_container.getBlock().index_entry.contigID, this->variant_container.getBlock().index_entry.minPosition); - - //stream << (int)it_i[i][0]; - for(U32 j = 0; j < it_i[i].size(); ++j) - stream << it_i[i][j] << ' '; - } - stream << '\n'; - } - return(0); - } -protected: - std::ifstream stream; - size_t filesize; - // Actual data +private: + basic_reader_type basic_reader; variant_container_type variant_container; - - // Supportive objects block_settings_type block_settings; settings_type settings; variant_filter_type variant_filters; @@ -641,6 +291,7 @@ class VariantReader{ codec_manager_type codec_manager; keychain_type keychain; interval_container_type interval_container; + yon_occ occ_table; }; } diff --git a/lib/view.h b/lib/view.h index f0f9e58..4c9b92e 100644 --- a/lib/view.h +++ b/lib/view.h @@ -29,36 +29,36 @@ DEALINGS IN THE SOFTWARE. 
#include "utility.h" #include "variant_reader.h" +#include "core/occ.h" + void view_usage(void){ programMessage(true); std::cerr << - "About: Convert YON->VCF/BCF or custom output; provides subset and slice operators data\n" + "About: Convert YON->VCF/BCF; provides subsetting and slicing functionality\n" "Usage: " << tachyon::constants::PROGRAM_NAME << " view [options] -i \n\n" "Options:\n" " -i FILE input YON file (required)\n" - " -o FILE output file (- for stdout; default: -)\n" - " -k FILE keychain with encryption keys (required if encrypted)\n" - " -O STRING output format: can be either JSON,VCF,BCF, or CUSTOM (-c must be triggered)\n" + " -o FILE output file (- for stdout)[-]\n" + " -k FILE keychain file with encryption keys (required if the file is encrypted)\n" + " -O y: tachyon archive, b: compressed BCF, u: uncompressed BCF, \n" + " z: compressed VCF, v: uncompressed VCF [v]\n" " -f STRING interpreted filter string for slicing output (see manual)\n" " -r STRING interval string\n" - " -R STRING path to file with interval strings\n" - " -d CHAR output delimiter (-c must be triggered)\n" - " -y custom output format (ignores VCF/BCF specification rules)\n" - " -V custom output data as vectors instead of per sample (valid only with -y)\n" + //" -R STRING path to file with interval strings\n" " -G drop all FORMAT fields from output\n" - " -h/H header only / no header\n" - " -s Hide all program messages\n\n" + " -X annotate FORMAT:GT data and add these statistics to the INFO column\n" + " -h/H header only / no header\n\n" - "Subset options:\n" - " -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" prefix)\n" - " -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n\n" + //"Subset options:\n" + //" -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" prefix)\n" + //" -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n\n" "Filter options:\n" 
" -a/A, --ref-match/--alt-match regular expression pattern for the reference allele -a or for any alternative alleles -A\n" " -n, --name-match regular expression pattern for the locus name\n" " -c/C, --min-ac/--max-ac minimum/maximum count for non-reference least frequent\n" " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n" - " -g, --genotype [^] require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n" + //" -g, --genotype [^] require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n" " -z/Z, --known/--novel select known/novel sites only (ID is not/is '.')\n" " -q/Q, --min-quality/--max-quality minimum/maximum quality value\n" " -m/M, --min-alleles/--max-alleles minimum/maximum number of alleles listed in REF and ALT\n" @@ -68,14 +68,13 @@ void view_usage(void){ " -l/L, --min-af/--max-af minimum/maximum frequency for non-reference least frequent\n" " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n" " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n" - " -e/E, --remove-unseen/--keep-unseen select/exclude sites with unseen alternative allele(s)\n" - " -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n\n"; + " -e/E, --remove-unseen/--keep-unseen select/exclude sites with unseen alternative allele(s)\n\n"; + //" -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n\n"; } int view(int argc, char** argv){ if(argc < 2){ - programMessage(); - programHelpDetailed(); + programHelp(); return(1); } @@ -87,43 +86,40 @@ int view(int argc, char** argv){ int option_index = 0; static struct option long_options[] = { - {"input", required_argument, 0, 'i' }, - {"output", optional_argument, 0, 'o' }, 
- {"keychain", optional_argument, 0, 'k' }, - {"filter", optional_argument, 0, 'f' }, - {"delimiter", optional_argument, 0, 'd' }, - {"output-type", optional_argument, 0, 'O' }, - {"vector-output", no_argument, 0, 'V' }, - {"annotate-genotype", no_argument, 0, 'X' }, - {"region", optional_argument, 0, 'r' }, - {"noHeader", no_argument, 0, 'H' }, - {"onlyHeader", no_argument, 0, 'h' }, - {"dropFormat", no_argument, 0, 'G' }, - {"customFormat", no_argument, 0, 'y' }, - {"silent", no_argument, 0, 's' }, - {"af-min", optional_argument, 0, 'l' }, - {"af-max", optional_argument, 0, 'L' }, - {"ac-min", optional_argument, 0, 'c' }, - {"ac-max", optional_argument, 0, 'C' }, - {"alleles-min", optional_argument, 0, 'm' }, - {"alleles-max", optional_argument, 0, 'M' }, - {"known", no_argument, 0, 'z' }, - {"novel", no_argument, 0, 'Z' }, - {"phased", no_argument, 0, 'p' }, - {"exclude-phased",no_argument, 0, 'P' }, - {"mixed-phase", no_argument, 0, 'j' }, - {"no-mixed-phase", no_argument, 0, 'J' }, - {"uncalled", no_argument, 0, 'u' }, - {"exclude-uncalled", no_argument, 0, 'U' }, - {"ref-match", optional_argument, 0, 'a' }, - {"alt-match", optional_argument, 0, 'A' }, - {"name-match", optional_argument, 0, 'n' }, - {"mixed-ploidy",no_argument, 0, 'w' }, - {"no-mixed-ploidy",no_argument, 0, 'W' }, - {"remove-unseen",no_argument, 0, 'e' }, - {"keep-unseen",no_argument, 0, 'E' }, - {"min-quality",optional_argument, 0, 'q' }, - {"max-quality",optional_argument, 0, 'Q' }, + {"input", required_argument, 0, 'i' }, + {"output", optional_argument, 0, 'o' }, + {"keychain", optional_argument, 0, 'k' }, + {"filter", optional_argument, 0, 'f' }, + {"output-type", optional_argument, 0, 'O' }, + {"annotate-genotype", no_argument, 0, 'X' }, + {"region", optional_argument, 0, 'r' }, + {"noHeader", no_argument, 0, 'H' }, + {"onlyHeader", no_argument, 0, 'h' }, + {"dropFormat", no_argument, 0, 'G' }, + {"silent", no_argument, 0, 's' }, + {"af-min", optional_argument, 0, 'l' }, + {"af-max", 
optional_argument, 0, 'L' }, + {"ac-min", optional_argument, 0, 'c' }, + {"ac-max", optional_argument, 0, 'C' }, + {"alleles-min", optional_argument, 0, 'm' }, + {"alleles-max", optional_argument, 0, 'M' }, + {"known", no_argument, 0, 'z' }, + {"novel", no_argument, 0, 'Z' }, + {"phased", no_argument, 0, 'p' }, + {"exclude-phased" ,no_argument, 0, 'P' }, + {"mixed-phase", no_argument, 0, 'j' }, + {"no-mixed-phase", no_argument, 0, 'J' }, + {"uncalled", no_argument, 0, 'u' }, + {"exclude-uncalled", no_argument, 0, 'U' }, + {"ref-match", optional_argument, 0, 'a' }, + {"alt-match", optional_argument, 0, 'A' }, + {"name-match", optional_argument, 0, 'n' }, + {"mixed-ploidy", no_argument, 0, 'w' }, + {"no-mixed-ploidy", no_argument, 0, 'W' }, + {"remove-unseen", no_argument, 0, 'e' }, + {"keep-unseen", no_argument, 0, 'E' }, + {"min-quality", optional_argument, 0, 'q' }, + {"max-quality", optional_argument, 0, 'Q' }, {0,0,0,0} }; @@ -135,9 +131,9 @@ int view(int argc, char** argv){ SILENT = 0; std::string temp; tachyon::VariantReader reader; - tachyon::VariantReaderFilters& filters = reader.getFilterSettings(); + tachyon::VariantReaderFilters& filters = reader.GetFilterSettings(); - while ((c = getopt_long(argc, argv, "i:o:k:f:d:O:r:yGshHVX?l:L:m:M:pPuUc:C:jJzZa:A:n:wWeEq:Q:", long_options, &option_index)) != -1){ + while ((c = getopt_long(argc, argv, "i:o:k:f:O:r:GshHX?l:L:m:M:pPuUc:C:jJzZa:A:n:wWeEq:Q:", long_options, &option_index)) != -1){ switch (c){ case 0: std::cerr << "Case 0: " << option_index << '\t' << long_options[option_index].name << std::endl; @@ -231,40 +227,17 @@ int view(int argc, char** argv){ case 's': SILENT = 1; break; - case 'y': - settings.custom_output_format = true; - break; case 'r': interval_strings.push_back(std::string(optarg)); break; - case 'd': - settings.custom_delimiter = true; - temp = std::string(optarg); - if(temp.size() != 1 && !(temp[0] == '\\' && temp.size() == 2)){ - std::cerr << "not a legal delimiter" << std::endl; - 
return(1); - } - if(temp.size() == 1) settings.custom_delimiter_char = temp[0]; - else { - if(temp[1] == 't') settings.custom_delimiter_char = '\t'; - else if(temp[1] == 'n') settings.custom_delimiter_char = '\n'; - else { - std::cerr << "not a legal delimiter" << std::endl; - return(1); - } - } - break; case 'h': settings.header_only = true; break; case 'H': settings.show_header = false; break; - case 'V': - settings.output_FORMAT_as_vector = true; - break; case 'O': - settings.output_type = std::string(optarg); + settings.output_type = optarg[0]; break; case 'X': settings.annotate_genotypes = true; @@ -279,7 +252,6 @@ int view(int argc, char** argv){ break; default: - std::cerr << "here default" << std::endl; std::cerr << tachyon::utility::timestamp("ERROR") << "Unrecognized option: " << (char)c << std::endl; return(1); } @@ -290,132 +262,75 @@ int view(int argc, char** argv){ return(1); } - // Print messages - /* - if(!SILENT){ - programMessage(); - std::cerr << tachyon::utility::timestamp("LOG") << "Calling view..." << std::endl; - } - */ - - - reader.getSettings() = settings; - - // temp - if(settings.keychain_file.size()){ - if(reader.loadKeychainFile(settings.keychain_file) == false) return 1; - } - if(!reader.open(settings.input)){ std::cerr << tachyon::utility::timestamp("ERROR") << "Failed to open file: " << settings.input << "..." << std::endl; return 1; } if(settings.header_only){ - reader.printHeaderVCF(); + reader.PrintHeaderVCF(); return(0); } // User provided '-f' string(s) if(interpret_commands.size()){ - if(!reader.getBlockSettings().parseCommandString(interpret_commands, reader.getGlobalHeader(), settings.custom_output_format)){ + if(!reader.GetBlockSettings().ParseCommandString(interpret_commands, reader.GetGlobalHeader())){ std::cerr << tachyon::utility::timestamp("ERROR") << "Failed to parse command..." 
<< std::endl; return(1); } } else { - reader.getBlockSettings().loadAll(true); + reader.GetBlockSettings().LoadAll(true); if(settings.drop_format){ - reader.getBlockSettings().loadGenotypes(false); - reader.getBlockSettings().ppa(false, false); - reader.getBlockSettings().format_all(false, false); + reader.GetBlockSettings().LoadGenotypes(false).LoadDisplayWrapper(false, YON_BLK_BV_PPA).LoadDisplayWrapper(false, YON_BLK_BV_FORMAT); } } - if(settings.custom_delimiter){ - if(settings.custom_output_format == false){ - std::cerr << tachyon::utility::timestamp("ERROR") << "Have to trigger -y when using a custom separator" << std::endl; - return(1); - } - reader.getBlockSettings().setCustomDelimiter(settings.custom_delimiter_char); + if(settings.output_type == 'v'){ + settings.use_htslib = false; + } else if(settings.output_type == 'z'){ + settings.output_type = 'z'; + settings.use_htslib = true; + } else if(settings.output_type == 'b'){ + settings.output_type = 'b'; + settings.use_htslib = true; + } else if(settings.output_type == 'u'){ + settings.output_type = 'u'; + settings.use_htslib = true; + } else if(settings.output_type == 'y'){ + std::cerr << "not supported yet" << std::endl; + return(1); + } else { + std::cerr << tachyon::utility::timestamp("ERROR") << "Unrecognised output option: " << settings.output_type << "..." << std::endl; + return(1); } - reader.getBlockSettings().output_format_vector = settings.output_FORMAT_as_vector; - - if(settings.output_type.size()){ - std::transform(settings.output_type.begin(), settings.output_type.end(), settings.output_type.begin(), ::toupper); // transform to UPPERCASE - if(strncmp(&settings.output_type[0], "JSON", 4) == 0 && settings.output_type.size() == 4){ - if(settings.custom_delimiter) - std::cerr << tachyon::utility::timestamp("WARNING") << "Custom output delimiter is incompatible with JSON. Disabled..." 
<< std::endl; - - settings.custom_output_format = true; - reader.getBlockSettings().custom_output_format = true; - reader.getBlockSettings().output_json = true; - reader.getBlockSettings().output_format_vector = true; - } else if(strncmp(&settings.output_type[0], "VCF", 3) == 0 && settings.output_type.size() == 3){ - if(settings.custom_delimiter) - std::cerr << tachyon::utility::timestamp("WARNING") << "Custom output delimiter is incompatible with VCF. Disabled..." << std::endl; - - reader.getBlockSettings().custom_output_format = false; - reader.getBlockSettings().custom_delimiter = false; - reader.getBlockSettings().custom_delimiter_char = '\t'; - - if(settings.output_FORMAT_as_vector) - std::cerr << tachyon::utility::timestamp("WARNING") << "Output FORMAT as vectors (-V) is incompatible with VCF output. Disabled..." << std::endl; - - reader.getBlockSettings().output_format_vector = false; - } else if(strncmp(&settings.output_type[0], "BCF", 3) == 0 && settings.output_type.size() == 3){ - reader.getBlockSettings().custom_output_format = false; - std::cerr << tachyon::utility::timestamp("ERROR") << "BCF output not supported yet." << std::endl; - return(1); - } else if(strncmp(&settings.output_type[0], "CUSTOM", 6) == 0 && settings.output_type.size() == 6){ - reader.getBlockSettings().custom_output_format = true; - settings.custom_output_format = true; - } else { - std::cerr << tachyon::utility::timestamp("ERROR") << "Unrecognised output option: " << settings.output_type << "..." 
<< std::endl; - return(1); - } - } // If user is triggering annotation if(settings.annotate_genotypes){ - reader.getBlockSettings().annotate_extra = true; - reader.getBlockSettings().loadGenotypes(true); - reader.getBlockSettings().set_membership(true, true); - reader.getBlockSettings().alleles(true, true); - reader.getBlockSettings().positions(true, true); + reader.GetBlockSettings().annotate_extra = true; + reader.GetBlockSettings().LoadGenotypes(true).LoadMinimumVcf(true); + if(settings.drop_format) reader.GetBlockSettings().DisplayWrapper(false, YON_BLK_BV_GT); + reader.GetGlobalHeader().AddGenotypeAnnotationFields(); } if(filters.doRequireGenotypes()){ - reader.getBlockSettings().loadGenotypes(true); - reader.getBlockSettings().set_membership.load = true; - reader.getBlockSettings().alleles.load = true; - reader.getBlockSettings().positions.load = true; + reader.GetBlockSettings().LoadGenotypes(true).LoadMinimumVcf(true); + if(settings.drop_format) reader.GetBlockSettings().DisplayWrapper(false, YON_BLK_BV_GT); } - reader.getBlockSettings().parseSettings(reader.getGlobalHeader()); + reader.GetSettings() = settings; tachyon::algorithm::Timer timer; timer.Start(); - if(settings.show_header) reader.getBlockSettings().show_vcf_header = true; - else reader.getBlockSettings().show_vcf_header = false; + if(settings.show_header) reader.GetBlockSettings().show_vcf_header = true; + else reader.GetBlockSettings().show_vcf_header = false; - if(reader.addIntervals(interval_strings) == false) return(1); + if(reader.AddIntervals(interval_strings) == false) return(1); - U64 n_variants = 0; - if(settings.custom_output_format) n_variants = reader.outputCustom(); - else n_variants = reader.outputVCF(); - //std::cerr << "Blocks: " << n_blocks << std::endl; - /* - std::cerr << "Variants: " - << tachyon::utility::ToPrettyString(n_variants) << " genotypes: " - << tachyon::utility::ToPrettyString(n_variants*reader.header.getSampleNumber()) << '\t' - << timer.ElapsedString() << '\t' - 
<< tachyon::utility::ToPrettyString((U64)((double)n_variants*reader.header.getSampleNumber()/timer.Elapsed().count())) - << std::endl; - */ + reader.OutputRecords(); return 0; } diff --git a/lib_example/calculate_depth_profile.cpp b/lib_example/calculate_depth_profile.cpp index 66ae658..78ca06c 100644 --- a/lib_example/calculate_depth_profile.cpp +++ b/lib_example/calculate_depth_profile.cpp @@ -40,7 +40,6 @@ int main(int argc, char** argv){ } std::vector depth_data(reader.getGlobalHeader().getSampleNumber()); - std::vector depth_data_truncated(reader.getGlobalHeader().getSampleNumber()); /**< * In this example we will write a simple program to calculate @@ -58,21 +57,17 @@ int main(int argc, char** argv){ for(U32 sample = 0; sample < dp_container->at(variant).size(); ++sample){ if(dp_container->at(variant).at(sample).size()){ depth_data[sample].addNonzero(dp_container->at(variant).at(sample)[0]); - depth_data_truncated[sample].addNonzero(dp_container->at(variant).at(sample)[0] > 100 ? 100 : dp_container->at(variant).at(sample)[0]); - - } + } } } } delete dp_container; } - std::cout << "Sample\tMean\tSD\tMin\tMax\tN\tMeanTrunc\tSDTrunc\tMinTrunc\tMaxTrunc\tNTrunc\n"; + std::cout << "Sample\tMean\tSD\tMin\tMax\tN\n"; for(U32 i = 0; i < reader.getGlobalHeader().getSampleNumber(); ++i){ depth_data[i].calculate(); - depth_data_truncated[i].calculate(); - std::cout << reader.getGlobalHeader().samples[i].name << "\t" << depth_data[i].mean << "\t" << depth_data[i].getStandardDeviation() << "\t" << depth_data[i].min << "\t" << depth_data[i].max << "\t" << depth_data[i].getCount() << "\t" - << depth_data_truncated[i].mean << "\t" << depth_data_truncated[i].getStandardDeviation() << "\t" << depth_data_truncated[i].min << "\t" << depth_data_truncated[i].max << "\t" << depth_data_truncated[i].getCount() << "\n"; + std::cout << reader.getGlobalHeader().samples[i].name << "\t" << depth_data[i].mean << "\t" << depth_data[i].getStandardDeviation() << "\t" << depth_data[i].min << 
"\t" << depth_data[i].max << "\t" << depth_data[i].getCount() << "\n"; } std::cout.flush(); diff --git a/lib_example/format_container_raw.cpp b/lib_example/format_container_raw.cpp new file mode 100644 index 0000000..e96e191 --- /dev/null +++ b/lib_example/format_container_raw.cpp @@ -0,0 +1,68 @@ +/* +Copyright (C) 2017-current Genome Research Ltd. +Author: Marcus D. R. Klarqvist + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +==============================================================================*/ + +#include "variant_reader.h" + +int main(int argc, char** argv){ + if(argc < 2){ + std::cerr << tachyon::utility::timestamp("ERROR") << "Have to provide an input name..." << std::endl; + return(1); + } + + std::string my_input_file(argv[1]); + tachyon::VariantReader reader; + + reader.getBlockSettings().loadFORMAT("PGT"); + + if(!reader.open(my_input_file)){ + std::cerr << tachyon::utility::timestamp("ERROR") << "Failed to open file: " << my_input_file << "..." 
<< std::endl; + return(1); + } + + /**< + * The `FormatContainer` class stores the data for each variant + * for each individual as container[variant][sample][data] + */ + while(reader.nextBlock()){ // As long as there are YON blocks available + // Meta container + tachyon::containers::MetaContainer meta(reader.getCurrentBlock().getBlock()); + + // FORMAT container with U32 return type primitive + tachyon::containers::FormatContainer* pgt_container = reader.getCurrentBlock().get_format_container("PGT"); + + if(pgt_container != nullptr){ + for(U32 variant = 0; variant < pgt_container->size(); ++variant){ + for(U32 sample = 0; sample < pgt_container->at(variant).size(); ++sample){ + // Write the data to `cout` in `VCF` formatting + tachyon::utility::to_vcf_string(std::cout, pgt_container->at(variant).at(sample)) << ' '; + } + std::cout << '\n'; + } + std::cout << '\n'; + } + delete pgt_container; + + } + + return(0); +} diff --git a/lib_example/genotype_likelihoods.cpp b/lib_example/genotype_likelihoods.cpp new file mode 100644 index 0000000..49ee938 --- /dev/null +++ b/lib_example/genotype_likelihoods.cpp @@ -0,0 +1,75 @@ +/* +Copyright (C) 2017-current Genome Research Ltd. +Author: Marcus D. R. Klarqvist + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +==============================================================================*/ + +#include "variant_reader.h" + +int main(int argc, char** argv){ + if(argc < 2){ + std::cerr << tachyon::utility::timestamp("ERROR") << "Have to provide an input name..." << std::endl; + return(1); + } + + std::string my_input_file(argv[1]); + tachyon::VariantReader reader; + + reader.getBlockSettings().loadFORMAT("PL"); + + if(!reader.open(my_input_file)){ + std::cerr << tachyon::utility::timestamp("ERROR") << "Failed to open file: " << my_input_file << "..." << std::endl; + return(1); + } + + /**< + * The `FormatContainer` class stores the data for each variant + * for each individual as container[variant][sample][data]. 
In this example + * we will use the FORMAT `PL` that generally describes the genotype likelihoods + * as determined by the variant caller + */ + while(reader.nextBlock()){ // As long as there are YON blocks available + // Meta container + tachyon::containers::MetaContainer meta(reader.getCurrentBlock().getBlock()); + + // FORMAT container with double return type primitive + tachyon::containers::FormatContainer* pl_container = reader.getCurrentBlock().get_balanced_format_container("PL", meta); + if(pl_container == nullptr) continue; + + // Iterate over PL container + for(U32 variant = 0; variant < pl_container->size(); ++variant){ + tachyon::utility::to_vcf_string(std::cout, '\t', meta[variant], reader.getGlobalHeader()); + std::cout << '\t'; + for(U32 sample = 0; sample < pl_container->at(variant).size(); ++sample){ + // Write the data to `cout` in `VCF` formatting + std::cout << ' '; + for(U32 entry = 0; entry < pl_container->at(variant).at(sample).size(); ++entry){ + std::cout << "," << pl_container->at(variant).at(sample).at(entry); + } + //tachyon::utility::to_vcf_string(std::cout, pl_container->at(variant).at(sample)) << ','; + } + std::cout << '\n'; + } + std::cout << '\n'; + delete pl_container; + } + + return(0); +} diff --git a/makefile b/makefile index a2770b3..0af9ba0 100644 --- a/makefile +++ b/makefile @@ -37,7 +37,7 @@ PREFIX := /usr/local # If you want to build in debug mode then add DEBUG=true to your build command # make DEBUG=true ifdef DEBUG -DEBUG_FLAGS := -g -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ +DEBUG_FLAGS := -g -Wall -Wextra -Wcast-qual -Wcast-align \ -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \ -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \ @@ -51,14 +51,21 @@ INCLUDE_PATH = -I./lib/ ZSTD_LIBRARY_PATH = # Check if ZSTD is in the current directory +UNAME_R := $(shell uname -r) ifneq ("$(wildcard ./zstd/)","") INCLUDE_PATH += 
-I./zstd/lib/ -I./zstd/lib/common/ ZSTD_LIBRARY_PATH = -L./zstd/lib -else ifneq ("$(wildcard /usr/local/include/)","") +else ifneq ("$(wildcard /usr/local/include/zstd.h)","") INCLUDE_PATH += -I/usr/local/include/ #ZSTD_LIBRARY_PATH = -L/usr/local/lib +else ifneq ("$(wildcard /usr/src/linux-headers-$(UNAME_R)/include/linux/zstd.h)","") + INCLUDE_PATH += -I/usr/src/linux-headers-$(UNAME_R)/include/linux/ + #ZSTD_LIBRARY_PATH = -L/usr/src/linux-headers-$(uname -r)/lib +else + INCLUDE_PATH += "-I/usr/src/linux-headers-$(UNAME_R)/include/linux/" endif + # Try to deduce where OpenSSL is located OPENSSL_LIBRARY_PATH = ifneq ("$(wildcard ./openssl/)","") @@ -72,7 +79,24 @@ else ifneq ("$(wildcard /usr/include/openssl/evp.h)","") OPENSSL_LIBRARY_PATH = -L/usr/lib/x86_64-linux-gnu/ endif -LIBRARY_PATHS := $(ZSTD_LIBRARY_PATH) $(OPENSSL_LIBRARY_PATH) -L/usr/local/lib/ +# Try to deduce where HTSLib is located +HSLIB_LIBRARY_PATH = +ifneq ("$(wildcard ./htslib/)","") + INCLUDE_PATH += -I./htslib/ + HSLIB_LIBRARY_PATH = -L./htslib/ +else ifneq ("$(wildcard /usr/local/include/htslib/)","") + INCLUDE_PATH += -I/usr/local/include/ + #OPENSSL_LIBRARY_PATH = -L/usr/local/lib/ +endif + +# Sort the include_path vector of strings to remove duplicates. This doesn't have +# any functional effect but dedupes the vector. +# Do NOT use equal sign here as the lazy evaluation will throw a recursion +# warning. 
+INCLUDE_PATH := $(sort $(INCLUDE_PATH)) + +# Library paths +LIBRARY_PATHS := $(ZSTD_LIBRARY_PATH) $(OPENSSL_LIBRARY_PATH) $(HSLIB_LIBRARY_PATH) -L/usr/local/lib/ OPTFLAGS := -O3 -msse4.2 # Legacy flags used @@ -86,17 +110,18 @@ endif # see : https://developer.apple.com/library/mac/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html ifneq ($(shell uname), Darwin) SHARED_EXT = so -LD_LIB_FLAGS = -shared -Wl,-rpath,./zstd/lib,-rpath,./openssl/,-soname,libtachyon.$(SHARED_EXT) +LD_LIB_FLAGS = -shared '-Wl,-rpath-link,$$ORIGIN/,-rpath-link,$(PWD),-rpath-link,$$ORIGIN/zstd/lib,-rpath-link,$$ORIGIN/openssl,-rpath-link,$$ORIGIN/htslib,-soname,libtachyon.$(SHARED_EXT)' else SHARED_EXT = dylib -LD_LIB_FLAGS = -dynamiclib -install_name libtachyon.$(SHARED_EXT) -Wl,-rpath,./zstd/lib,-rpath,./openssl/ +LD_LIB_FLAGS = -dynamiclib -install_name libtachyon.$(SHARED_EXT) '-Wl,-rpath-link,$$ORIGIN/,-rpath-link,$(PWD),-rpath-link,$$ORIGIN/zstd/lib,-rpath-link,$$ORIGIN/openssl,-rpath-link,$$ORIGIN/htslib' endif CXXFLAGS = -std=c++0x $(OPTFLAGS) $(DEBUG_FLAGS) CFLAGS = -std=c99 $(OPTFLAGS) $(DEBUG_FLAGS) -BINARY_RPATHS = '-Wl,-rpath,$$ORIGIN/zstd/lib,-rpath,$$ORIGIN/openssl/' +CFLAGS_VENDOR = -std=c99 $(OPTFLAGS) +BINARY_RPATHS = '-Wl,-rpath,$$ORIGIN/,-rpath,$(PWD),-rpath,$$ORIGIN/zstd/lib,-rpath,$$ORIGIN/openssl,-rpath,$$ORIGIN/htslib' -LIBS := -lzstd -lcrypto +LIBS := -lzstd -lcrypto -lhts CXX_SOURCE = $(wildcard lib/algorithm/compression/*.cpp) \ $(wildcard lib/algorithm/digest/*.cpp) \ $(wildcard lib/algorithm/encryption/*.cpp) \ @@ -105,6 +130,7 @@ CXX_SOURCE = $(wildcard lib/algorithm/compression/*.cpp) \ $(wildcard lib/containers/components/*.cpp) \ $(wildcard lib/core/header/*.cpp) \ $(wildcard lib/core/*.cpp) \ + $(wildcard lib/index/*.cpp) \ $(wildcard lib/io/*.cpp) \ $(wildcard lib/io/bcf/*.cpp) \ $(wildcard lib/io/compression/*.cpp) \ @@ -116,27 +142,12 @@ CXX_SOURCE = $(wildcard lib/algorithm/compression/*.cpp) \ 
C_SOURCE = \ lib/third_party/xxhash/xxhash.c \ -lib/third_party/zlib/adler32.c \ -lib/third_party/zlib/crc32.c \ -lib/third_party/zlib/deflate.c \ -lib/third_party/zlib/infback.c \ -lib/third_party/zlib/inffast.c \ -lib/third_party/zlib/inflate.c \ -lib/third_party/zlib/inftrees.c \ -lib/third_party/zlib/trees.c \ -lib/third_party/zlib/zutil.c \ -lib/third_party/zlib/compress.c \ -lib/third_party/zlib/uncompr.c \ -lib/third_party/zlib/gzclose.c \ -lib/third_party/zlib/gzlib.c \ -lib/third_party/zlib/gzread.c \ -lib/third_party/zlib/gzwrite.c \ OBJECTS = $(CXX_SOURCE:.cpp=.o) $(C_SOURCE:.c=.o) CPP_DEPS = $(CXX_SOURCE:.cpp=.d) $(C_SOURCE:.c=.d) LIB_INCLUDE_PATH = -I./lib/ -LIB_EXAMPLE_FLAGS = -L./ -ltachyon '-Wl,-rpath,$$ORIGIN/../,-rpath,$(PWD),-rpath,$$ORIGIN/../zstd/lib,-rpath,$$ORIGIN/../openssl' +LIB_EXAMPLE_FLAGS = -L./ -ltachyon '-Wl,-rpath,$$ORIGIN/../,-rpath,$(PWD)' LIB_EXAMPLE_SOURCE = $(wildcard lib_example/*.cpp) LIB_EXAMPLE_OUTPUT = $(LIB_EXAMPLE_SOURCE:.cpp=) @@ -153,10 +164,7 @@ all: tachyon # Third party rules lib/third_party/xxhash/%.o: lib/third_party/xxhash/%.c - gcc $(CFLAGS) -c -o $@ $< - -lib/third_party/zlib/%.o: lib/third_party/zlib/%.c - gcc $(CFLAGS) -c -o $@ $< + gcc $(CFLAGS_VENDOR) -c -o $@ $< # Generic rules %.o: %.cpp @@ -166,7 +174,7 @@ tachyon: $(OBJECTS) g++ $(BINARY_RPATHS) $(LIBRARY_PATHS) -pthread $(OBJECTS) $(LIBS) -o tachyon $(MAKE) cleanmost $(MAKE) library library=true - $(MAKE) examples + #$(MAKE) examples library: $(OBJECTS) @echo 'Building dynamic library...'