Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow multiallelic variants in transform #232

Merged
merged 8 commits into from
Jan 12, 2024
9 changes: 0 additions & 9 deletions docs/commands/transform.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,6 @@ To match haplotypes as well as their ancestral population labels, use the ``--an

haptools transform --ancestry tests/data/simple-ancestry.vcf tests/data/simple.hap

.. warning::
If your VCF has multi-allelic variants, they must be split into bi-allelic records before you can use ``transform``. After splitting, you should rename the IDs in your file to ensure they remain unique:

.. code-block:: bash

bcftools norm -m- -Ou input.vcf.gz | \
bcftools annotate -Ov --set-id +'%CHROM\_%POS\_%REF\_%FIRST_ALT' | \
haptools transform -o output.vcf.gz /dev/stdin file.hap

All files used in these examples are described :doc:`here </project_info/example_files>`.


Expand Down
6 changes: 4 additions & 2 deletions docs/formats/genotypes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ VCF/BCF

Genotype files must be specified as VCF or BCF files. They can be bgzip-compressed.

To be loaded properly, VCFs must follow the VCF specification. VCFs with duplicate variant IDs do not follow the specification; the IDs must be unique. Please validate your VCF using a tool like `gatk ValidateVariants <https://gatk.broadinstitute.org/hc/en-us/articles/360037057272-ValidateVariants>`_ before using haptools.

.. _formats-genotypesplink:

PLINK2 PGEN
Expand All @@ -24,11 +26,11 @@ If you run out memory when using PGEN files, consider reading/writing variants f

Converting from VCF to PGEN
---------------------------
To convert a VCF containing only biallelic SNPs to PGEN, use the following command.
To convert a VCF containing only SNPs to PGEN, use the following command.

.. code-block:: bash

plink2 --snps-only 'just-acgt' --max-alleles 2 --vcf input.vcf --make-pgen --out output
plink2 --snps-only 'just-acgt' --vcf input.vcf --make-pgen --out output

To convert a VCF containing tandem repeats to PGEN, use this command, instead.

Expand Down
33 changes: 20 additions & 13 deletions haptools/data/haplotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,7 @@ def transform(self, genotypes: GenotypesVCF) -> npt.NDArray[bool]:
denotes the presence of the haplotype in one chromosome of a sample
"""
var_IDs = self.varIDs
# ensure the variants in the Genotypes object are ordered according to var_IDs
gts = genotypes.subset(variants=var_IDs)
# check: were any of the variants absent from the genotypes?
if len(gts.variants) < len(var_IDs):
Expand All @@ -490,14 +491,17 @@ def transform(self, genotypes: GenotypesVCF) -> npt.NDArray[bool]:
# note: the excessive use of square-brackets gives us shape (1, p, 1)
# where p denotes the number of alleles in this haplotype
# That shape is broadcastable with gts.data which has shape (n, p, 2)
allele_arr = np.array(
[
try:
allele_arr = np.array(
[
[int(var.allele != gts.variants[i]["alleles"][0])]
for i, var in enumerate(self.variants)
[
[gts.variants[i]["alleles"].index(var.allele)]
for i, var in enumerate(self.variants)
]
]
]
)
)
except ValueError:
raise ValueError("Some alleles were not present in the genotypes")
# look for the presence of each allele in each chromosomal strand
# and then just AND them together
return np.all(allele_arr == gts.data[:, :, :2], axis=1)
Expand Down Expand Up @@ -1367,13 +1371,16 @@ def transform(
self.log.debug(f"Creating array denoting alt allele status")
# initialize a np array denoting the allele integer in each haplotype
# with shape (1, gts.data.shape[1], 1) for broadcasting later
allele_arr = np.array(
[
int(allele != gts.variants[i]["alleles"][0])
for i, (vID, allele) in enumerate(alleles)
],
dtype=gts.data.dtype,
)[np.newaxis, :, np.newaxis]
try:
allele_arr = np.array(
[
gts.variants[i]["alleles"].index(allele)
for i, (vID, allele) in enumerate(alleles)
],
dtype=gts.data.dtype,
)[np.newaxis, :, np.newaxis]
except ValueError:
raise ValueError("Some alleles were not present in the genotypes")
# finally, obtain and merge the haplotype genotypes
self.log.info(f"Transforming genotypes for {len(haps)} haplotypes")
equality_arr = np.equal(allele_arr, gts.data[:, :, :2])
Expand Down
1 change: 0 additions & 1 deletion haptools/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,6 @@ def transform_haps(
# gt._prephased = True
gt.read(region=region, samples=samples, variants=variants)
gt.check_missing(discard_also=discard_missing)
gt.check_biallelic()
gt.check_phase()

# check that all of the variants were loaded successfully and warn otherwise
Expand Down
51 changes: 51 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1172,6 +1172,16 @@ def _get_dummy_haps(self):
haps.data = haplotypes
return haps

def _get_dummy_haps_multiallelic(self):
haps = self._get_dummy_haps()
h1_vars = list(haps.data["H1"].variants)
h1_vars[1] = Variant(start=10116, end=10117, id="1:10116:A:G", allele="T")
haps.data["H1"].variants = tuple(h1_vars)
h3_vars = list(haps.data["H3"].variants)
h3_vars[1] = Variant(start=10122, end=10123, id="1:10122:A:G", allele="C")
haps.data["H3"].variants = tuple(h3_vars)
return haps

def test_load(self):
expected = self._basic_haps()

Expand Down Expand Up @@ -1553,6 +1563,23 @@ def test_hap_transform(self):
hap_gt = hap.transform(gens)
np.testing.assert_allclose(hap_gt, expected)

def test_hap_transform_multiallelic(self):
expected = np.array(
[
[0, 0],
[0, 0],
[1, 0],
[0, 0],
[0, 1],
],
dtype=np.uint8,
)

hap = list(self._get_dummy_haps_multiallelic().data.values())[0]
gens = TestGenotypesVCF()._get_fake_genotypes_multiallelic()
hap_gt = hap.transform(gens)
np.testing.assert_allclose(hap_gt, expected)

def test_haps_transform(self, return_also=False):
expected = np.array(
[
Expand All @@ -1576,6 +1603,30 @@ def test_haps_transform(self, return_also=False):
if return_also:
return hap_gt

def test_haps_transform_multiallelic(self, return_also=False):
expected = np.array(
[
[[0, 0], [0, 0], [0, 0]],
[[0, 0], [0, 0], [1, 0]],
[[1, 0], [0, 1], [0, 0]],
[[0, 0], [0, 0], [0, 0]],
[[0, 0], [0, 1], [1, 0]],
],
dtype=np.uint8,
)

haps = self._get_dummy_haps_multiallelic()
gens = TestGenotypesVCF()._get_fake_genotypes_multiallelic()
gens.data[[2, 4], 0, 1] = 1
gens.data[[1, 4], 2, 0] = 1
gens.data[[1, 4], 3, 0] = 2
hap_gt = GenotypesVCF(fname=None)
haps.transform(gens, hap_gt)
np.testing.assert_allclose(hap_gt.data, expected)

if return_also:
return hap_gt

def test_hap_gt_write(self):
fname = DATADIR / "simple_haps.vcf"

Expand Down
30 changes: 29 additions & 1 deletion tests/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
GenotypesAncestry,
)
from haptools.__main__ import main
from .test_data import TestGenotypesVCF
from .test_data import TestGenotypesVCF, TestHaplotypes
from haptools.data import Variant, GenotypesVCF, GenotypesPLINK


Expand Down Expand Up @@ -234,6 +234,34 @@ def test_basic(capfd):
assert result.exit_code == 0


def test_basic_multiallelic(capfd):
expected = """##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=1>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tHG00096\tHG00097\tHG00099\tHG00100\tHG00101
1\t10114\tH1\tA\tT\t.\t.\t.\tGT\t0|0\t0|0\t1|0\t0|0\t0|1
1\t10114\tH2\tA\tT\t.\t.\t.\tGT\t0|0\t0|0\t0|0\t0|0\t0|0
1\t10116\tH3\tA\tT\t.\t.\t.\tGT\t0|0\t0|0\t0|0\t0|0\t0|0
"""
gt_file = DATADIR / "simple-multiallelic.vcf"
hp_file = DATADIR / "simple-multiallelic.hap"

# first, create the multiallelic hap file
hp = TestHaplotypes()._get_dummy_haps_multiallelic()
hp.fname = hp_file
hp.write()

cmd = f"transform {gt_file} {hp_file}"
runner = CliRunner()
result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
captured = capfd.readouterr()
assert captured.out == expected
assert result.exit_code == 0

hp.fname.unlink()


def test_basic_subset(capfd):
expected = """##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
Expand Down
Loading