Skip to content

Commit

Permalink
fix(preprocessor): remove -refRegion argument from preprocessor
Browse files Browse the repository at this point in the history
  • Loading branch information
markwoon committed Nov 12, 2024
1 parent f429762 commit 5331c2b
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 97 deletions.
10 changes: 3 additions & 7 deletions docs/using/Running-PharmCAT-Pipeline.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Standard use case:
```
usage: pharmcat_pipeline [-s <samples> | -S <txt_file>]
[-0] [--absent-to-ref] [-unspecified-to-ref] [-G]
[-R] [-refRegion <bed_file>]
[-R <bed_file>]
[-matcher] [-ma] [-matcherHtml] [-research <type>]
[-phenotyper]
[-reporter] [-rs <sources>] [-re] [-reporterJson]
Expand Down Expand Up @@ -73,13 +73,9 @@ Preprocessor arguments:
Assume unspecified genotypes ./. as 0/0 when every sample is './.'. DANGEROUS!
-G, --no-gvcf-check
Bypass the gVCF check for the input VCF.
-R, --retain-specific-regions
Retain all variants in genomic regions of interest in VCF.
Defaults to entire region of genes covered by PharmCAT.
-R `<bed_file>`, --retain-specific-regions `<bed_file>`
A sorted .bed file indicating regions to retain in VCF.
For research use only. Additional variants are not used by PharmCAT and will slow PharmCAT down.
-refRegion `<bed_file>`, --reference-regions-to-retain `<bed_file>`
A sorted bed file of specific PGx regions to retain.
Use with the `-R` to specify custom regions.
Named allele matcher arguments:
-matcher Run named allele matcher independently.
Expand Down
8 changes: 3 additions & 5 deletions docs/using/VCF-Preprocessor.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,9 @@ look for `pharmcat_positions.vcf.bgz` under the current working directory. You c
: The [GRCh38.p13](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.39/) FASTA file. The FASTA file can be either decompressed or compressed but has to be indexed (.fai and, in addition, .gzi for the compressed file). We recommended the compressed reference genome FASTA file for the sake of storage. These mandatory files will be automatically downloaded (~0.9 GB) to the same directory
as the reference PGx VCF file (`-refVcf`) if not provided by user (see [Notes](#notes) for details).

-R <span class="altArg"><br />or --retain-specific-regions</span>
: Retain the genomic regions specified by \'-refRegion\'

-refRegion `<bed_file>` <span class="altArg"><br />or --reference-regions-to-retain `<bed_file>`</span>
: A sorted bed file of specific PGx regions to retain. Must be used with the `-R` argument.
-R `<bed_file>` <span class="altArg"><br />or --retain-specific-regions `<bed_file>`</span>
: A sorted .bed file indicating regions to retain in VCF.
For research use only. Additional variants are not used by PharmCAT and will slow PharmCAT down.

-bcftools `</path/to/bcftools>` <span class="altArg"><br />or --path-to-bcftools `</path/to/bcftools>`</span>
: bcftools must be installed. This argument is optional if bcftools is available in your PATH.
Expand Down
39 changes: 10 additions & 29 deletions preprocessor/pharmcat_pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -82,17 +82,11 @@ if __name__ == '__main__':
"DANGEROUS!")
preprocessor_group.add_argument("-G", "--no-gvcf-check", action="store_true",
help="Bypass check if VCF file is in gVCF format.")

adv_preprocessor_group = parser.add_argument_group('Advanced preprocessor arguments (research only)')
adv_preprocessor_group.add_argument("-R", "--retain-specific-regions", action="store_true",
help='Retain all variants in genomic regions of interest in VCF. '
'Defaults to entire region of genes covered by PharmCAT. '
'For research use only. '
'Additional variants are not used by PharmCAT and will slow PharmCAT down.')
adv_preprocessor_group.add_argument("-refRegion", "--reference-regions-to-retain",
type=str, metavar='<bed_file>',
help='A sorted .bed file of PGx regions to retain. '
'Use with "-R" to specify custom regions.')
preprocessor_group.add_argument("-R", "--retain-specific-regions",
type=str, metavar='<bed_file>',
help='A sorted .bed file indicating regions to retain in VCF. '
'For research use only. '
'Additional variants are not used by PharmCAT and will slow PharmCAT down.')

# matcher args
matcher_group = parser.add_argument_group('Named allele matcher arguments')
Expand Down Expand Up @@ -217,21 +211,10 @@ lead to different results.
print('Downloading reference FASTA. This may take a while...')
m_reference_genome = preprocessor.download_reference_fasta_and_index(m_pharmcat_positions_vcf.parent,
verbose=args.verbose)
# make sure we have the pharmcat_regions.bed file if the user choose to retain regions
# validate .bed file if retaining specific regions
m_regions_bed: Optional[Path] = None
if args.retain_specific_regions:
m_retain_specific_regions: bool = True
if args.reference_regions_to_retain:
m_pharmcat_regions_bed = preprocessor.validate_file(args.reference_regions_to_retain)
else:
m_pharmcat_regions_bed = preprocessor.find_file(preprocessor.PHARMCAT_REGIONS_FILENAME,
list({script_dir}))
if m_pharmcat_regions_bed is None:
print('Downloading %s...' % preprocessor.PHARMCAT_REGIONS_FILENAME)
m_pharmcat_regions_bed = preprocessor.download_pharmcat_accessory_files(
script_dir, download_region_bed=True, verbose=args.verbose)
else:
m_retain_specific_regions: bool = False
m_pharmcat_regions_bed = None
m_regions_bed = preprocessor.validate_file(args.retain_specific_regions)

# prep pharmcat_positions helper files
preprocessor.prep_pharmcat_positions(m_pharmcat_positions_vcf, m_reference_genome, verbose=args.verbose)
Expand Down Expand Up @@ -315,8 +298,7 @@ lead to different results.
keep_intermediate_files=False,
absent_to_ref=m_absent_to_ref,
unspecified_to_ref=m_unspecified_to_ref,
retain_specific_regions=m_retain_specific_regions,
reference_regions_to_retain=m_pharmcat_regions_bed,
reference_regions_to_retain=m_regions_bed,
concurrent_mode=True,
max_processes=m_max_processes,
verbose=args.verbose,
Expand Down Expand Up @@ -387,8 +369,7 @@ lead to different results.
keep_intermediate_files=False,
absent_to_ref=m_absent_to_ref,
unspecified_to_ref=m_unspecified_to_ref,
retain_specific_regions=m_retain_specific_regions,
reference_regions_to_retain=m_pharmcat_regions_bed,
reference_regions_to_retain=m_regions_bed,
concurrent_mode=True,
max_processes=m_max_processes,
verbose=args.verbose,
Expand Down
32 changes: 8 additions & 24 deletions preprocessor/pharmcat_vcf_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys
from pathlib import Path
from timeit import default_timer as timer
from typing import List
from typing import List, Optional

import preprocessor

Expand Down Expand Up @@ -63,14 +63,11 @@
'or the directory the preprocessor is in.')
advanced_group.add_argument("-refFna", "--reference-genome", type=str, metavar='<fna_file>',
help="The Human Reference Genome GRCh38/hg38 in the fasta format.")
advanced_group.add_argument("-R", "--retain-specific-regions", action="store_true",
help='Retain all variants in genomic regions of interest in VCF. '
'Defaults to entire region of genes covered by PharmCAT. '
advanced_group.add_argument("-R", "--retain-specific-regions",
type=str, metavar='<bed_file>',
help='A sorted .bed file indicating regions to retain in VCF. '
'For research use only. '
'Additional variants are not used by PharmCAT and will slow PharmCAT down.')
advanced_group.add_argument("-refRegion", "--reference-regions-to-retain", type=str, metavar='<bed_file>',
help='A sorted .bed file of PGx regions to retain. '
'Use with "-R" to specify custom regions.')
advanced_group.add_argument("-bcftools", "--path-to-bcftools", type=str, metavar='</path/to/bcftools>',
help="Path to the bcftools program. Defaults to bcftools in PATH.")
advanced_group.add_argument("-bgzip", "--path-to-bgzip", type=str, metavar='</path/to/bgzip>',
Expand Down Expand Up @@ -162,22 +159,10 @@
m_reference_genome = preprocessor.download_reference_fasta_and_index(m_pharmcat_positions_vcf.parent,
verbose=args.verbose)

# make sure we have pharmcat_regions.bed
# validate .bed file if retaining specific regions
m_regions_bed: Optional[Path] = None
if args.retain_specific_regions:
m_retain_specific_regions: bool = True
m_pharmcat_regions_bed: Path
if args.reference_regions_to_retain:
m_pharmcat_regions_bed = preprocessor.validate_file(args.reference_regions_to_retain)
else:
m_pharmcat_regions_bed = preprocessor.find_file(preprocessor.PHARMCAT_REGIONS_FILENAME,
list({Path.cwd(), script_dir}))
if m_pharmcat_regions_bed is None:
print('Downloading pharmcat_regions.bed...')
m_pharmcat_regions_bed = preprocessor.download_pharmcat_accessory_files(
script_dir, download_region_bed=True, verbose=args.verbose)
else:
m_retain_specific_regions: bool = False
m_pharmcat_regions_bed = None
m_regions_bed = preprocessor.validate_file(args.retain_specific_regions)

# prep pharmcat_positions helper files
preprocessor.prep_pharmcat_positions(m_pharmcat_positions_vcf, m_reference_genome, verbose=args.verbose)
Expand Down Expand Up @@ -275,8 +260,7 @@
keep_intermediate_files=args.keep_intermediate_files,
absent_to_ref=m_absent_to_ref,
unspecified_to_ref=m_unspecified_to_ref,
retain_specific_regions=m_retain_specific_regions,
reference_regions_to_retain=m_pharmcat_regions_bed,
reference_regions_to_retain=m_regions_bed,
concurrent_mode=args.concurrent_mode,
max_processes=m_max_processes,
verbose=args.verbose,
Expand Down
24 changes: 13 additions & 11 deletions preprocessor/preprocessor/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
output_dir: Path, output_basename: Optional[str] = '', split_samples: bool = False,
keep_intermediate_files: bool = False,
absent_to_ref: bool = False, unspecified_to_ref: bool = False,
retain_specific_regions: bool = False, reference_regions_to_retain: Path = None,
reference_regions_to_retain: Path = None,
concurrent_mode: bool = False, max_processes: int = 1, verbose: int = 0) -> List[Path]:
"""
Normalize and prepare the input VCF for PharmCAT.
Expand All @@ -34,7 +34,7 @@ def preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
vcf_files, samples,
output_dir, basename,
keep_intermediate_files, absent_to_ref, unspecified_to_ref,
retain_specific_regions, reference_regions_to_retain,
reference_regions_to_retain,
concurrent_mode, max_processes, verbose)
if split_samples and len(samples) > 1:
util.index_vcf(multisample_vcf, verbose)
Expand All @@ -54,8 +54,9 @@ def preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
def preprocess_multiple_files(pharmcat_positions_vcf: Path, reference_genome: Path,
vcf_files: List[Path], samples: Optional[List[str]],
output_dir: Path, output_basename: Optional[str] = '',
keep_intermediate_files: bool = False, missing_to_ref: bool = False,
retain_specific_regions: bool = False, reference_regions_to_retain: Path = None,
keep_intermediate_files: bool = False,
absent_to_ref: bool = False, unspecified_to_ref: bool = False,
reference_regions_to_retain: Path = None,
concurrent_mode: bool = False, max_processes: int = 1, verbose: int = 0) -> List[Path]:
"""
Normalize and prepare the input VCF for PharmCAT.
Expand All @@ -70,7 +71,7 @@ def preprocess_multiple_files(pharmcat_positions_vcf: Path, reference_genome: Pa
if samples is None or len(samples) == 0:
file_samples = util.read_vcf_samples(file, verbose=verbose)
else:
# make sure samples are in vcf file
# make sure samples are in the VCF file
vcf_samples = util.read_vcf_samples(vcf_files[0], verbose=verbose)
for sample in samples:
if sample in vcf_samples:
Expand All @@ -82,8 +83,9 @@ def preprocess_multiple_files(pharmcat_positions_vcf: Path, reference_genome: Pa
multisample_vcf = _preprocess(pharmcat_positions_vcf, reference_genome,
[file], file_samples,
output_dir, basename,
keep_intermediate_files, missing_to_ref,
retain_specific_regions, reference_regions_to_retain,
keep_intermediate_files,
absent_to_ref, unspecified_to_ref,
reference_regions_to_retain,
concurrent_mode, max_processes, verbose)
results.append(finalize_multisample_vcf(multisample_vcf, output_dir, basename))

Expand All @@ -95,14 +97,14 @@ def _preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
output_dir: Path, output_basename: Optional[str] = '',
keep_intermediate_files: bool = False,
absent_to_ref: bool = False, unspecified_to_ref: bool = False,
retain_specific_regions: bool = False, reference_regions_to_retain: Path = None,
reference_regions_to_retain: Path = None,
concurrent_mode=False, max_processes=1, verbose: int = 0) -> Path:

# shrink input VCF down to PGx allele defining regions and selected samples
# modify input VCF chromosomes naming format to <chr##>
# standardize chromosome names to <chr##>
pgx_region_vcf: Path = util.extract_pgx_regions(pharmcat_positions_vcf, vcf_files, samples,
output_dir, output_basename,
retain_specific_regions, reference_regions_to_retain,
reference_regions_to_retain,
concurrent_mode=concurrent_mode, max_processes=max_processes,
verbose=verbose)
# normalize the input VCF
Expand All @@ -114,7 +116,7 @@ def _preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
output_dir, output_basename,
absent_to_ref=absent_to_ref,
unspecified_to_ref=unspecified_to_ref,
retain_specific_regions=retain_specific_regions,
retain_specific_regions=bool(reference_regions_to_retain),
verbose=verbose)

if not keep_intermediate_files:
Expand Down
Loading

0 comments on commit 5331c2b

Please sign in to comment.