Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: enhance annotator CLI options #449

Merged
merged 5 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions docs/extras/vcf_annotator.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ The tool uses a SeqRepo data proxy. By default, the local instance at `/usr/loca
Example of how to run:

```commandline
python3 -m src.ga4gh.vrs.extras.vcf_annotation --vcf_in input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl
python3 -m src.ga4gh.vrs.extras.vcf_annotation input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl
```

`--vcf_in` specifies the path of the input VCF file to annotate. `--vcf_out` specifies the path of the output annotated VCF file. The `--vrs_pickle_out` specifies the path of the output pickle file containing VRS data (Both vcf_out and vrs_pickle_out are optional, but at least one __must__ be provided).
Pass the path of the input VCF file as the argument to the script. Use `--vcf_out` to specify the path of the output annotated VCF file and/or `--vrs_pickle_out` to specify the path of the output pickle file containing VRS data (each option is individually optional, but at least one of them __must__ be provided).

### Use local SeqRepo Data Proxy with different root directory

Expand All @@ -32,7 +32,7 @@ You can change the root directory of SeqRepo by using `seqrepo_root_dir`.
To use the local SeqRepo data proxy with SeqRepo root directory at `vrs-python/seqrepo/latest`:

```commandline
python3 -m src.ga4gh.vrs.extras.vcf_annotation --vcf_in input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl --seqrepo_root_dir vrs-python/seqrepo/latest
python3 -m src.ga4gh.vrs.extras.vcf_annotation input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl --seqrepo_root_dir vrs-python/seqrepo/latest
```

### Use the REST SeqRepo Data Proxy with default base url
Expand All @@ -42,15 +42,15 @@ You can change the data proxy type by using: `--seqrepo_dp_type` (options are `l
To use the REST SeqRepo data proxy at default url: `http://localhost:5000/seqrepo`:

```commandline
python3 -m src.ga4gh.vrs.extras.vcf_annotation --vcf_in input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl --seqrepo_dp_type rest
python3 -m src.ga4gh.vrs.extras.vcf_annotation input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl --seqrepo_dp_type rest
```

### Use the REST SeqRepo Data Proxy with different base url
You can change the SeqRepo REST base url by using: `--seqrepo_base_url`.

To use the REST SeqRepo data proxy, at custom url: `http://custom.url:5000/seqrepo`:
```commandline
python3 -m src.ga4gh.vrs.extras.vcf_annotation --vcf_in input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl --seqrepo_dp_type rest --seqrepo_base_url http://custom.url:5000/seqrepo
python3 -m src.ga4gh.vrs.extras.vcf_annotation input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl --seqrepo_dp_type rest --seqrepo_base_url http://custom.url:5000/seqrepo
```

### Other Options
Expand All @@ -67,4 +67,4 @@ python3 -m src.ga4gh.vrs.extras.vcf_annotation --vcf_in input.vcf.gz --vcf_out o
>Require validation checks to pass in order to return a VRS object

`--help`
>Show the options available
>Show the options available
63 changes: 32 additions & 31 deletions src/ga4gh/vrs/extras/vcf_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
python3 -m src.ga4gh.vrs.extras.vcf_annotation input.vcf.gz \
--vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl
"""
import pathlib
import logging
import pickle
from enum import Enum
Expand Down Expand Up @@ -35,78 +36,75 @@ class SeqRepoProxyType(str, Enum):


@click.command()
@click.option(
"--vcf_in",
required=True,
type=str,
help="The path for the input VCF file to annotate"
@click.argument(
"vcf_in",
nargs=1,
type=click.Path(exists=True, readable=True, dir_okay=False, path_type=pathlib.Path)
)
@click.option(
"--vcf_out",
required=False,
type=str,
help=("The path for the output VCF file. If not provided, must provide "
type=click.Path(writable=True, allow_dash=False, path_type=pathlib.Path),
help=("Declare save location for output annotated VCF. If not provided, must provide "
"--vrs_pickle_out.")
)
@click.option(
"--vrs_pickle_out",
required=False,
type=str,
help=("The path for the output VCF pickle file. If not provided, must provide "
"--vcf_out")
type=click.Path(writable=True, allow_dash=False, path_type=pathlib.Path),
help=("Declare save location for output VCF pickle. If not provided, must provide "
"--vcf_out.")
)
@click.option(
"--vrs_attributes",
is_flag=True,
default=False,
help="Will include VRS_Start, VRS_End, VRS_State fields in the INFO field.",
show_default=True
help="Include VRS_Start, VRS_End, and VRS_State fields in the VCF output INFO field.",
)
@click.option(
"--seqrepo_dp_type",
required=False,
default=SeqRepoProxyType.LOCAL,
type=click.Choice([v.value for v in SeqRepoProxyType.__members__.values()],
case_sensitive=True),
help="The type of the SeqRepo Data Proxy to use",
help="Specify type of SeqRepo dataproxy to use.",
show_default=True,
show_choices=True
)
@click.option(
"--seqrepo_root_dir",
required=False,
default="/usr/local/share/seqrepo/latest",
help="The root directory for local SeqRepo instance",
default=pathlib.Path("/usr/local/share/seqrepo/latest"),
type=click.Path(path_type=pathlib.Path),
help="Define root directory for local SeqRepo instance, if --seqrepo_dp_type=local.",
show_default=True
)
@click.option(
"--seqrepo_base_url",
required=False,
default="http://localhost:5000/seqrepo",
help="The base url for SeqRepo REST API",
help="Specify base URL for SeqRepo REST API, if --seqrepo_dp_type=rest.",
show_default=True
)
@click.option(
"--assembly",
required=False,
default="GRCh38",
show_default=True,
help="The assembly that the `vcf_in` data uses.",
help="Specify assembly that was used to create input VCF.",
type=str
)
@click.option(
"--skip_ref",
is_flag=True,
default=False,
show_default=True,
help="Skip VRS computation for REF alleles."
)
@click.option(
"--require_validation",
is_flag=True,
default=False,
show_default=True,
help="Require validation checks to pass in order to return a VRS object"
help="Require validation checks to pass to construct a VRS object."
)
@click.option(
"--silent",
Expand All @@ -116,33 +114,36 @@ class SeqRepoProxyType(str, Enum):
help="Suppress messages printed to stdout"
)
def annotate_click( # pylint: disable=too-many-arguments
vcf_in: str, vcf_out: str | None, vrs_pickle_out: str | None,
vrs_attributes: bool, seqrepo_dp_type: SeqRepoProxyType, seqrepo_root_dir: str,
vcf_in: pathlib.Path, vcf_out: pathlib.Path | None, vrs_pickle_out: pathlib.Path | None,
vrs_attributes: bool, seqrepo_dp_type: SeqRepoProxyType, seqrepo_root_dir: pathlib.Path,
seqrepo_base_url: str, assembly: str, skip_ref: bool, require_validation: bool,
silent: bool,
) -> None:
"""Annotate VCF file via click
"""Extract VRS objects from VCF located at VCF_IN.

Example arguments:
$ python3 src/ga4gh/vrs/extras/vcf_annotation.py input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl

--vcf_in input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl
Note that at least one of --vcf_out or --vrs_pickle_out must be selected and defined.
"""
annotator = VCFAnnotator(seqrepo_dp_type, seqrepo_base_url, seqrepo_root_dir)
annotator = VCFAnnotator(seqrepo_dp_type, seqrepo_base_url, str(seqrepo_root_dir.absolute()))
vcf_out_str = str(vcf_out.absolute()) if vcf_out is not None else vcf_out
vrs_pkl_out_str = str(vrs_pickle_out.absolute()) if vrs_pickle_out is not None else vrs_pickle_out
start = timer()
msg = f"Annotating {vcf_in} with the VCF Annotator..."
_logger.info(msg)
if not silent:
click.echo(msg)
annotator.annotate(
vcf_in, vcf_out=vcf_out, vrs_pickle_out=vrs_pickle_out,
str(vcf_in.absolute()), vcf_out=vcf_out_str, vrs_pickle_out=vrs_pkl_out_str,
vrs_attributes=vrs_attributes, assembly=assembly,
compute_for_ref=(not skip_ref), require_validation=require_validation
)
end = timer()
msg = f"VCF Annotator finished in {(end - start):.5f} seconds"
_logger.info(msg)
if not silent:
_logger.info(msg)
click.echo(msg)
click.echo(msg)


class VCFAnnotator: # pylint: disable=too-few-public-methods
"""Annotate VCFs with VRS allele IDs.
Expand All @@ -157,7 +158,7 @@ class VCFAnnotator: # pylint: disable=too-few-public-methods
VRS_ENDS_FIELD = "VRS_Ends"
VRS_STATES_FIELD = "VRS_States"
VRS_ERROR_FIELD = "VRS_Error"
# VCF character escape map
# VCF character escape map
VCF_ESCAPE_MAP = [
("%", "%25"),
(";", "%3B"),
Expand Down Expand Up @@ -420,6 +421,6 @@ def _get_vrs_data( # pylint: disable=too-many-arguments,too-many-locals


if __name__ == "__main__":
# python3 -m src.ga4gh.vrs.extras.vcf_annotation --vcf_in input.vcf.gz \
# python3 -m src.ga4gh.vrs.extras.vcf_annotation input.vcf.gz \
# --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl
annotate_click() # pylint: disable=no-value-for-parameter