Skip to content

Commit

Permalink
feat: add param to VCFAnnotator to optionally require validation checks (#345)
Browse files Browse the repository at this point in the history

* feat: add param to VCFAnnotator to optionally require validation checks

* Adds `require_validation` parameter. If `True`, validation checks must
  pass in order to return a VRS object. If validation checks fail, a
ValidationError will be raised. If `False`, then the VRS object will be
returned even if validation checks fail.

* add new line

* add require_validation as param

* update default
  • Loading branch information
korikuzma committed Feb 15, 2024
1 parent 18d8898 commit 0a7f817
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 13 deletions.
49 changes: 39 additions & 10 deletions src/ga4gh/vrs/extras/vcf_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,17 @@ class SeqRepoProxyType(str, Enum):
show_default=True,
help="Skip VRS computation for REF alleles."
)
@click.option(
"--require_validation",
is_flag=True,
default=False,
show_default=True,
help="Require validation checks to pass in order to return a VRS object"
)
def annotate_click( # pylint: disable=too-many-arguments
vcf_in: str, vcf_out: Optional[str], vrs_pickle_out: Optional[str],
vrs_attributes: bool, seqrepo_dp_type: SeqRepoProxyType, seqrepo_root_dir: str,
seqrepo_base_url: str, assembly: str, skip_ref: bool
seqrepo_base_url: str, assembly: str, skip_ref: bool, require_validation: bool
) -> None:
"""Annotate VCF file via click
Expand All @@ -118,7 +125,11 @@ def annotate_click( # pylint: disable=too-many-arguments
msg = f"Annotating {vcf_in} with the VCF Annotator..."
_logger.info(msg)
click.echo(msg)
annotator.annotate(vcf_in, vcf_out, vrs_pickle_out, vrs_attributes, assembly, (not skip_ref))
annotator.annotate(
vcf_in, vcf_out=vcf_out, vrs_pickle_out=vrs_pickle_out,
vrs_attributes=vrs_attributes, assembly=assembly,
compute_for_ref=(not skip_ref), require_validation=require_validation
)
end = timer()
msg = f"VCF Annotator finished in {(end - start):.5f} seconds"
_logger.info(msg)
Expand Down Expand Up @@ -166,7 +177,8 @@ def __init__(self, seqrepo_dp_type: SeqRepoProxyType = SeqRepoProxyType.LOCAL,
def annotate( # pylint: disable=too-many-arguments,too-many-locals
self, vcf_in: str, vcf_out: Optional[str] = None,
vrs_pickle_out: Optional[str] = None, vrs_attributes: bool = False,
assembly: str = "GRCh38", compute_for_ref: bool = True
assembly: str = "GRCh38", compute_for_ref: bool = True,
require_validation: bool = True
) -> None:
"""Annotates an input VCF file with VRS Allele IDs & creates a pickle file
containing the vrs object information.
Expand All @@ -179,6 +191,9 @@ def annotate( # pylint: disable=too-many-arguments,too-many-locals
Only used if `vcf_out` is provided.
:param str assembly: The assembly used in `vcf_in` data
:param compute_for_ref: If true, compute VRS IDs for the reference allele
:param bool require_validation: If `True` then validation checks must pass in
order to return a VRS object. If `False` then VRS object will be returned
even if validation checks fail. Defaults to `True`.
"""
if not any((vcf_out, vrs_pickle_out)):
raise VCFAnnotatorException(
Expand Down Expand Up @@ -228,8 +243,10 @@ def annotate( # pylint: disable=too-many-arguments,too-many-locals
additional_info_fields += [self.VRS_STARTS_FIELD, self.VRS_ENDS_FIELD, self.VRS_STATES_FIELD]
try:
vrs_field_data = self._get_vrs_data(
record, vrs_data, assembly, additional_info_fields, vrs_attributes,
output_pickle, output_vcf, compute_for_ref
record, vrs_data, assembly, additional_info_fields,
vrs_attributes=vrs_attributes, output_pickle=output_pickle,
output_vcf=output_vcf, compute_for_ref=compute_for_ref,
require_validation=require_validation
)
except Exception as ex:
_logger.exception("VRS error on %s-%s", record.chrom, record.pos)
Expand Down Expand Up @@ -258,7 +275,8 @@ def annotate( # pylint: disable=too-many-arguments,too-many-locals
def _get_vrs_object( # pylint: disable=too-many-arguments,too-many-locals
self, vcf_coords: str, vrs_data: Dict, vrs_field_data: Dict, assembly: str,
vrs_data_key: Optional[str] = None, output_pickle: bool = True,
output_vcf: bool = False, vrs_attributes: bool = False
output_vcf: bool = False, vrs_attributes: bool = False,
require_validation: bool = True
) -> None:
"""Get VRS Object given `vcf_coords`. `vrs_data` and `vrs_field_data` will
be mutated.
Expand All @@ -278,9 +296,16 @@ def _get_vrs_object( # pylint: disable=too-many-arguments,too-many-locals
:param bool vrs_attributes: If `True` will include VRS_Start, VRS_End,
VRS_State fields in the INFO field. If `False` will not include these fields.
Only used if `output_vcf` set to `True`.
:param bool require_validation: If `True` then validation checks must pass in
order to return a VRS object. A `ValidationError` will be raised if
validation checks fail. If `False` then VRS object will be returned even if
validation checks fail. Defaults to `True`.
"""
try:
vrs_obj = self.tlr._from_gnomad(vcf_coords, assembly_name=assembly)
vrs_obj = self.tlr._from_gnomad(
vcf_coords,
assembly_name=assembly,
require_validation=require_validation
)
except (ValidationError, TranslatorValidationError) as e:
vrs_obj = None
_logger.error("ValidationError when translating %s from gnomad: %s", vcf_coords, str(e))
Expand Down Expand Up @@ -330,7 +355,7 @@ def _get_vrs_data( # pylint: disable=too-many-arguments,too-many-locals
self, record: pysam.VariantRecord, vrs_data: Dict, assembly: str, # pylint: disable=no-member
additional_info_fields: List[str], vrs_attributes: bool = False,
output_pickle: bool = True, output_vcf: bool = True,
compute_for_ref: bool = True
compute_for_ref: bool = True, require_validation: bool = True
) -> Dict:
"""Get VRS data for record's reference and alt alleles.
Expand All @@ -351,6 +376,10 @@ def _get_vrs_data( # pylint: disable=too-many-arguments,too-many-locals
of associated values. If `output_vcf = False`, an empty dictionary will be
returned.
:param compute_for_ref: If true, compute VRS IDs for the reference allele
:param bool require_validation: If `True` then validation checks must pass in
order to return a VRS object. A `ValidationError` will be raised if
validation checks fail. If `False` then VRS object will be returned even if
validation checks fail. Defaults to `True`.
"""
vrs_field_data = {field: [] for field in additional_info_fields} if output_vcf else {}

Expand All @@ -361,7 +390,7 @@ def _get_vrs_data( # pylint: disable=too-many-arguments,too-many-locals
self._get_vrs_object(
reference_allele, vrs_data, vrs_field_data, assembly,
output_pickle=output_pickle, output_vcf=output_vcf,
vrs_attributes=vrs_attributes
vrs_attributes=vrs_attributes, require_validation=require_validation
)

# Get VRS data for alts
Expand All @@ -378,7 +407,7 @@ def _get_vrs_data( # pylint: disable=too-many-arguments,too-many-locals
self._get_vrs_object(
allele, vrs_data, vrs_field_data, assembly, vrs_data_key=data,
output_pickle=output_pickle, output_vcf=output_vcf,
vrs_attributes=vrs_attributes
vrs_attributes=vrs_attributes, require_validation=require_validation
)

return vrs_field_data
Expand Down
77 changes: 74 additions & 3 deletions tests/extras/cassettes/test_get_vrs_object_invalid_input.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ interactions:
Content-Type:
- application/json
Date:
- Tue, 19 Sep 2023 05:03:24 GMT
- Tue, 13 Feb 2024 23:37:55 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
Expand Down Expand Up @@ -64,7 +64,7 @@ interactions:
Content-Type:
- application/json
Date:
- Tue, 19 Sep 2023 05:03:24 GMT
- Tue, 13 Feb 2024 23:37:55 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
Expand Down Expand Up @@ -94,7 +94,78 @@ interactions:
Content-Type:
- text/plain; charset=utf-8
Date:
- Tue, 19 Sep 2023 05:03:24 GMT
- Tue, 13 Feb 2024 23:37:55 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
code: 200
message: OK
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/metadata/ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul
response:
body:
string: "{\n \"added\": \"2016-08-27T21:23:35Z\",\n \"aliases\": [\n \"GRCh38:7\",\n
\ \"GRCh38:chr7\",\n \"GRCh38.p1:7\",\n \"GRCh38.p1:chr7\",\n \"GRCh38.p10:7\",\n
\ \"GRCh38.p10:chr7\",\n \"GRCh38.p11:7\",\n \"GRCh38.p11:chr7\",\n
\ \"GRCh38.p12:7\",\n \"GRCh38.p12:chr7\",\n \"GRCh38.p2:7\",\n \"GRCh38.p2:chr7\",\n
\ \"GRCh38.p3:7\",\n \"GRCh38.p3:chr7\",\n \"GRCh38.p4:7\",\n \"GRCh38.p4:chr7\",\n
\ \"GRCh38.p5:7\",\n \"GRCh38.p5:chr7\",\n \"GRCh38.p6:7\",\n \"GRCh38.p6:chr7\",\n
\ \"GRCh38.p7:7\",\n \"GRCh38.p7:chr7\",\n \"GRCh38.p8:7\",\n \"GRCh38.p8:chr7\",\n
\ \"GRCh38.p9:7\",\n \"GRCh38.p9:chr7\",\n \"MD5:cc044cc2256a1141212660fb07b6171e\",\n
\ \"NCBI:NC_000007.14\",\n \"refseq:NC_000007.14\",\n \"SEGUID:4+JjCcBVhPCr8vdIhUKFycPv8bY\",\n
\ \"SHA1:e3e26309c05584f0abf2f748854285c9c3eff1b6\",\n \"VMC:GS_F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul\",\n
\ \"sha512t24u:F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul\",\n \"ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul\"\n
\ ],\n \"alphabet\": \"ACGNRSTY\",\n \"length\": 159345973\n}\n"
headers:
Connection:
- close
Content-Length:
- '977'
Content-Type:
- application/json
Date:
- Tue, 13 Feb 2024 23:37:55 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
code: 200
message: OK
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul?start=140753335&end=140753336
response:
body:
string: A
headers:
Connection:
- close
Content-Length:
- '1'
Content-Type:
- text/plain; charset=utf-8
Date:
- Tue, 13 Feb 2024 23:37:55 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
Expand Down
16 changes: 16 additions & 0 deletions tests/extras/test_vcf_annotation.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""Ensure proper functionality of VCFAnnotator"""
import gzip
import os
import re

import pytest
from ga4gh.vrs.extras.translator import ValidationError

from ga4gh.vrs.extras.vcf_annotation import VCFAnnotator, VCFAnnotatorException

Expand Down Expand Up @@ -162,3 +164,17 @@ def test_get_vrs_object_invalid_input(vcf_annotator, caplog):
# No ALT
vcf_annotator._get_vrs_object("7-140753336-A-.", {}, [], "GRCh38")
assert "None was returned when translating 7-140753336-A-. from gnomad" in caplog.text

# Invalid ref, but not requiring validation checks so no error is raised
vcf_annotator._get_vrs_object(
"7-140753336-G-T", {}, [], "GRCh38", require_validation=False
)
assert "Expected reference sequence G on GRCh38:7 at positions (140753335, 140753336) but found A" in caplog.text

# Invalid ref, but requiring validation checks so an error is raised
invalid_ref_seq_msg = "Expected reference sequence C on GRCh38:7 at positions (140753335, 140753336) but found A"
with pytest.raises(ValidationError, match=re.escape(invalid_ref_seq_msg)):
vcf_annotator._get_vrs_object(
"7-140753336-C-T", {}, [], "GRCh38", require_validation=True
)
assert invalid_ref_seq_msg in caplog.text

0 comments on commit 0a7f817

Please sign in to comment.