From 5331c2b0dd1f2538763dc7fb5fd598698b58fe89 Mon Sep 17 00:00:00 2001 From: Mark Woon Date: Mon, 11 Nov 2024 18:03:28 -0800 Subject: [PATCH] fix(preprocessor): remove -refRegion argument from preprocessor --- docs/using/Running-PharmCAT-Pipeline.md | 10 ++---- docs/using/VCF-Preprocessor.md | 8 ++--- preprocessor/pharmcat_pipeline | 39 ++++++----------------- preprocessor/pharmcat_vcf_preprocessor.py | 32 +++++-------------- preprocessor/preprocessor/preprocess.py | 24 +++++++------- preprocessor/preprocessor/utilities.py | 38 ++++++++++------------ 6 files changed, 54 insertions(+), 97 deletions(-) diff --git a/docs/using/Running-PharmCAT-Pipeline.md b/docs/using/Running-PharmCAT-Pipeline.md index 98ad5015..96a3d5b6 100644 --- a/docs/using/Running-PharmCAT-Pipeline.md +++ b/docs/using/Running-PharmCAT-Pipeline.md @@ -39,7 +39,7 @@ Standard use case: ``` usage: pharmcat_pipeline [-s | -S ] [-0] [--absent-to-ref] [-unspecified-to-ref] [-G] - [-R] [-refRegion ] + [-R ] [-matcher] [-ma] [-matcherHtml] [-research ] [-phenotyper] [-reporter] [-rs ] [-re] [-reporterJson] @@ -73,13 +73,9 @@ Preprocessor arguments: Assume unspecified genotypes ./. as 0/0 when every sample is './.'. DANGEROUS! -G, --no-gvcf-check Bypass the gVCF check for the input VCF. - -R, --retain-specific-regions - Retain all variants in genomic regions of interest in VCF. - Defaults to entire region of genes covered by PharmCAT. + -R ``, --retain-specific-regions `` + A sorted .bed file indicating regions to retain in VCF. For research use only. Additional variants are not used by PharmCAT and will slow PharmCAT down. - -refRegion ``, --reference-regions-to-retain `` - A sorted bed file of specific PGx regions to retain. - Use with the `-R` to specify custom regions. Named allele matcher arguments: -matcher Run named allele matcher independently. diff --git a/docs/using/VCF-Preprocessor.md b/docs/using/VCF-Preprocessor.md index 0ec7e0c4..e962f962 100644 --- a/docs/using/VCF-Preprocessor.md +++ b/docs/using/VCF-Preprocessor.md @@ -154,11 +154,9 @@ look for `pharmcat_positions.vcf.bgz` under the current working directory. You c : The [GRCh38.p13](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.39/) FASTA file. The FASTA file can be either decompressed or compressed but has to be indexed (.fai and, in addition, .gzi for the compressed file). We recommended the compressed reference genome FASTA file for the sake of storage. These mandatory files will be automatically downloaded (~0.9 GB) to the same directory as the reference PGx VCF file (`-refVcf`) if not provided by user (see [Notes](#notes) for details). --R
or --retain-specific-regions
-: Retain the genomic regions specified by \'-refRegion\' - --refRegion ``
or --reference-regions-to-retain ``
-: A sorted bed file of specific PGx regions to retain. Must be used with the `-R` argument. +-R ``
or --retain-specific-regions ``
+: A sorted .bed file indicating regions to retain in VCF. +For research use only. Additional variants are not used by PharmCAT and will slow PharmCAT down. -bcftools ``
or --path-to-bcftools ``
: bcftools must be installed. This argument is optional if bcftools is available in your PATH. diff --git a/preprocessor/pharmcat_pipeline b/preprocessor/pharmcat_pipeline index 2ebc00a9..9892699c 100644 --- a/preprocessor/pharmcat_pipeline +++ b/preprocessor/pharmcat_pipeline @@ -82,17 +82,11 @@ if __name__ == '__main__': "DANGEROUS!") preprocessor_group.add_argument("-G", "--no-gvcf-check", action="store_true", help="Bypass check if VCF file is in gVCF format.") - - adv_preprocessor_group = parser.add_argument_group('Advanced preprocessor arguments (research only)') - adv_preprocessor_group.add_argument("-R", "--retain-specific-regions", action="store_true", - help='Retain all variants in genomic regions of interest in VCF. ' - 'Defaults to entire region of genes covered by PharmCAT. ' - 'For research use only. ' - 'Additional variants are not used by PharmCAT and will slow PharmCAT down.') - adv_preprocessor_group.add_argument("-refRegion", "--reference-regions-to-retain", - type=str, metavar='', - help='A sorted .bed file of PGx regions to retain. ' - 'Use with "-R" to specify custom regions.') + preprocessor_group.add_argument("-R", "--retain-specific-regions", + type=str, metavar='', + help='A sorted .bed file indicating regions to retain in VCF. ' + 'For research use only. ' + 'Additional variants are not used by PharmCAT and will slow PharmCAT down.') # matcher args matcher_group = parser.add_argument_group('Named allele matcher arguments') @@ -217,21 +211,10 @@ lead to different results. print('Downloading reference FASTA. This may take a while...') m_reference_genome = preprocessor.download_reference_fasta_and_index(m_pharmcat_positions_vcf.parent, verbose=args.verbose) - # make sure we have the pharmcat_regions.bed file if the user choose to retain regions + # validate .bed file if retaining specific regions + m_regions_bed: Optional[Path] = None if args.retain_specific_regions: - m_retain_specific_regions: bool = True - if args.reference_regions_to_retain: - m_pharmcat_regions_bed = preprocessor.validate_file(args.reference_regions_to_retain) - else: - m_pharmcat_regions_bed = preprocessor.find_file(preprocessor.PHARMCAT_REGIONS_FILENAME, - list({script_dir})) - if m_pharmcat_regions_bed is None: - print('Downloading %s...' % preprocessor.PHARMCAT_REGIONS_FILENAME) - m_pharmcat_regions_bed = preprocessor.download_pharmcat_accessory_files( - script_dir, download_region_bed=True, verbose=args.verbose) - else: - m_retain_specific_regions: bool = False - m_pharmcat_regions_bed = None + m_regions_bed = preprocessor.validate_file(args.retain_specific_regions) # prep pharmcat_positions helper files preprocessor.prep_pharmcat_positions(m_pharmcat_positions_vcf, m_reference_genome, verbose=args.verbose) @@ -315,8 +298,7 @@ lead to different results. keep_intermediate_files=False, absent_to_ref=m_absent_to_ref, unspecified_to_ref=m_unspecified_to_ref, - retain_specific_regions=m_retain_specific_regions, - reference_regions_to_retain=m_pharmcat_regions_bed, + reference_regions_to_retain=m_regions_bed, concurrent_mode=True, max_processes=m_max_processes, verbose=args.verbose, @@ -387,8 +369,7 @@ lead to different results. keep_intermediate_files=False, absent_to_ref=m_absent_to_ref, unspecified_to_ref=m_unspecified_to_ref, - retain_specific_regions=m_retain_specific_regions, - reference_regions_to_retain=m_pharmcat_regions_bed, + reference_regions_to_retain=m_regions_bed, concurrent_mode=True, max_processes=m_max_processes, verbose=args.verbose, diff --git a/preprocessor/pharmcat_vcf_preprocessor.py b/preprocessor/pharmcat_vcf_preprocessor.py index de860219..2e4636f3 100644 --- a/preprocessor/pharmcat_vcf_preprocessor.py +++ b/preprocessor/pharmcat_vcf_preprocessor.py @@ -5,7 +5,7 @@ import sys from pathlib import Path from timeit import default_timer as timer -from typing import List +from typing import List, Optional import preprocessor @@ -63,14 +63,11 @@ 'or the directory the preprocessor is in.') advanced_group.add_argument("-refFna", "--reference-genome", type=str, metavar='', help="The Human Reference Genome GRCh38/hg38 in the fasta format.") - advanced_group.add_argument("-R", "--retain-specific-regions", action="store_true", - help='Retain all variants in genomic regions of interest in VCF. ' - 'Defaults to entire region of genes covered by PharmCAT. ' + advanced_group.add_argument("-R", "--retain-specific-regions", + type=str, metavar='', + help='A sorted .bed file indicating regions to retain in VCF. ' 'For research use only. ' 'Additional variants are not used by PharmCAT and will slow PharmCAT down.') - advanced_group.add_argument("-refRegion", "--reference-regions-to-retain", type=str, metavar='', - help='A sorted .bed file of PGx regions to retain. ' - 'Use with "-R" to specify custom regions.') advanced_group.add_argument("-bcftools", "--path-to-bcftools", type=str, metavar='', help="Path to the bcftools program. Defaults to bcftools in PATH.") advanced_group.add_argument("-bgzip", "--path-to-bgzip", type=str, metavar='', @@ -162,22 +159,10 @@ m_reference_genome = preprocessor.download_reference_fasta_and_index(m_pharmcat_positions_vcf.parent, verbose=args.verbose) - # make sure we have pharmcat_regions.bed + # validate .bed file if retaining specific regions + m_regions_bed: Optional[Path] = None if args.retain_specific_regions: - m_retain_specific_regions: bool = True - m_pharmcat_regions_bed: Path - if args.reference_regions_to_retain: - m_pharmcat_regions_bed = preprocessor.validate_file(args.reference_regions_to_retain) - else: - m_pharmcat_regions_bed = preprocessor.find_file(preprocessor.PHARMCAT_REGIONS_FILENAME, - list({Path.cwd(), script_dir})) - if m_pharmcat_regions_bed is None: - print('Downloading pharmcat_regions.bed...') - m_pharmcat_regions_bed = preprocessor.download_pharmcat_accessory_files( - script_dir, download_region_bed=True, verbose=args.verbose) - else: - m_retain_specific_regions: bool = False - m_pharmcat_regions_bed = None + m_regions_bed = preprocessor.validate_file(args.retain_specific_regions) # prep pharmcat_positions helper files preprocessor.prep_pharmcat_positions(m_pharmcat_positions_vcf, m_reference_genome, verbose=args.verbose) @@ -275,8 +260,7 @@ keep_intermediate_files=args.keep_intermediate_files, absent_to_ref=m_absent_to_ref, unspecified_to_ref=m_unspecified_to_ref, - retain_specific_regions=m_retain_specific_regions, - reference_regions_to_retain=m_pharmcat_regions_bed, + reference_regions_to_retain=m_regions_bed, concurrent_mode=args.concurrent_mode, max_processes=m_max_processes, verbose=args.verbose, diff --git a/preprocessor/preprocessor/preprocess.py b/preprocessor/preprocessor/preprocess.py index 1f9b8a46..e83d4135 100644 --- a/preprocessor/preprocessor/preprocess.py +++ b/preprocessor/preprocessor/preprocess.py @@ -11,7 +11,7 @@ def preprocess(pharmcat_positions_vcf: Path, reference_genome: Path, output_dir: Path, output_basename: Optional[str] = '', split_samples: bool = False, keep_intermediate_files: bool = False, absent_to_ref: bool = False, unspecified_to_ref: bool = False, - retain_specific_regions: bool = False, reference_regions_to_retain: Path = None, + reference_regions_to_retain: Path = None, concurrent_mode: bool = False, max_processes: int = 1, verbose: int = 0) -> List[Path]: """ Normalize and prepare the input VCF for PharmCAT. @@ -34,7 +34,7 @@ def preprocess(pharmcat_positions_vcf: Path, reference_genome: Path, vcf_files, samples, output_dir, basename, keep_intermediate_files, absent_to_ref, unspecified_to_ref, - retain_specific_regions, reference_regions_to_retain, + reference_regions_to_retain, concurrent_mode, max_processes, verbose) if split_samples and len(samples) > 1: util.index_vcf(multisample_vcf, verbose) @@ -54,8 +54,9 @@ def preprocess(pharmcat_positions_vcf: Path, reference_genome: Path, def preprocess_multiple_files(pharmcat_positions_vcf: Path, reference_genome: Path, vcf_files: List[Path], samples: Optional[List[str]], output_dir: Path, output_basename: Optional[str] = '', - keep_intermediate_files: bool = False, missing_to_ref: bool = False, - retain_specific_regions: bool = False, reference_regions_to_retain: Path = None, + keep_intermediate_files: bool = False, + absent_to_ref: bool = False, unspecified_to_ref: bool = False, + reference_regions_to_retain: Path = None, concurrent_mode: bool = False, max_processes: int = 1, verbose: int = 0) -> List[Path]: """ Normalize and prepare the input VCF for PharmCAT. @@ -70,7 +71,7 @@ def preprocess_multiple_files(pharmcat_positions_vcf: Path, reference_genome: Pa if samples is None or len(samples) == 0: file_samples = util.read_vcf_samples(file, verbose=verbose) else: - # make sure samples are in vcf file + # make sure samples are in the VCF file vcf_samples = util.read_vcf_samples(vcf_files[0], verbose=verbose) for sample in samples: if sample in vcf_samples: @@ -82,8 +83,9 @@ def preprocess_multiple_files(pharmcat_positions_vcf: Path, reference_genome: Pa multisample_vcf = _preprocess(pharmcat_positions_vcf, reference_genome, [file], file_samples, output_dir, basename, - keep_intermediate_files, missing_to_ref, - retain_specific_regions, reference_regions_to_retain, + keep_intermediate_files, + absent_to_ref, unspecified_to_ref, + reference_regions_to_retain, concurrent_mode, max_processes, verbose) results.append(finalize_multisample_vcf(multisample_vcf, output_dir, basename)) @@ -95,14 +97,14 @@ def _preprocess(pharmcat_positions_vcf: Path, reference_genome: Path, output_dir: Path, output_basename: Optional[str] = '', keep_intermediate_files: bool = False, absent_to_ref: bool = False, unspecified_to_ref: bool = False, - retain_specific_regions: bool = False, reference_regions_to_retain: Path = None, + reference_regions_to_retain: Path = None, concurrent_mode=False, max_processes=1, verbose: int = 0) -> Path: # shrink input VCF down to PGx allele defining regions and selected samples - # modify input VCF chromosomes naming format to + # standardize chromosome names to pgx_region_vcf: Path = util.extract_pgx_regions(pharmcat_positions_vcf, vcf_files, samples, output_dir, output_basename, - retain_specific_regions, reference_regions_to_retain, + reference_regions_to_retain, concurrent_mode=concurrent_mode, max_processes=max_processes, verbose=verbose) # normalize the input VCF @@ -114,7 +116,7 @@ def _preprocess(pharmcat_positions_vcf: Path, reference_genome: Path, output_dir, output_basename, absent_to_ref=absent_to_ref, unspecified_to_ref=unspecified_to_ref, - retain_specific_regions=retain_specific_regions, + retain_specific_regions=bool(reference_regions_to_retain), verbose=verbose) if not keep_intermediate_files: diff --git a/preprocessor/preprocessor/utilities.py b/preprocessor/preprocessor/utilities.py index 9177ed09..affda6a8 100644 --- a/preprocessor/preprocessor/utilities.py +++ b/preprocessor/preprocessor/utilities.py @@ -19,6 +19,7 @@ from urllib.error import HTTPError import pandas as pd +from colorama import Fore, Style from packaging import version from . import common @@ -394,8 +395,8 @@ def read_sample_file(sample_file: Path, verbose: int = 0) -> List[str]: if line and not line.startswith("#"): samples.append(line) if len(samples) == 0: - # TODO(markwoon): use colorama to highlight this in next release - print("Warning: No samples found. Will use all samples listed in VCF.") + print(Fore.RED + "Warning: No samples found. Will use all samples listed in VCF." + + Style.RESET_ALL) else: validate_samples(samples) return samples @@ -526,20 +527,15 @@ def download_from_url(url: str, download_dir: Path, force_update: bool = False, raise InvalidURL(url) -def download_pharmcat_accessory_files(download_dir: Union[Path, str], force_update: bool = False, - download_region_bed: bool = False, verbose: int = 0): +def download_pharmcat_accessory_files(download_dir: Union[Path, str], force_update: bool = False, verbose: int = 0): if not isinstance(download_dir, Path): download_dir = Path(download_dir) pos_file = download_dir / common.PHARMCAT_POSITIONS_FILENAME if pos_file.exists() and not force_update: return pos_file - if download_region_bed: - url = 'https://github.com/PharmGKB/PharmCAT/releases/download/v%s/pharmcat_regions_%s.bed' % \ - (common.PHARMCAT_VERSION, common.PHARMCAT_VERSION) - else: - url = 'https://github.com/PharmGKB/PharmCAT/releases/download/v%s/pharmcat_positions_%s.vcf.bgz' %\ - (common.PHARMCAT_VERSION, common.PHARMCAT_VERSION) + url = 'https://github.com/PharmGKB/PharmCAT/releases/download/v%s/pharmcat_positions_%s.vcf.bgz' %\ + (common.PHARMCAT_VERSION, common.PHARMCAT_VERSION) try: dl_file = download_from_url(url, download_dir, force_update, verbose) # use shutil.move instead of rename to deal with cross-device issues @@ -656,7 +652,7 @@ def _is_valid_chr(vcf_file: Path) -> bool: def extract_pgx_regions(pharmcat_positions: Path, vcf_files: List[Path], samples: List[str], output_dir: Path, output_basename: str, - retain_specific_regions: bool = False, reference_regions_to_retain: Path = None, + reference_regions_to_retain: Path = None, concurrent_mode: bool = False, max_processes: int = 1, verbose: int = 0) -> Path: """ @@ -678,7 +674,7 @@ def extract_pgx_regions(pharmcat_positions: Path, vcf_files: List[Path], samples if len(vcf_files) == 1: # this should create pgx_region_vcf_file _extract_pgx_regions(pharmcat_positions, vcf_files[0], tmp_sample_file, output_dir, output_basename, - retain_specific_regions, reference_regions_to_retain, verbose) + reference_regions_to_retain, verbose) else: # generate files to be concatenated tmp_files: List[Path] = [] @@ -688,16 +684,16 @@ def extract_pgx_regions(pharmcat_positions: Path, vcf_files: List[Path], samples futures = [] for vcf_file in vcf_files: futures.append(e.submit(_extract_pgx_regions, pharmcat_positions, vcf_file, tmp_sample_file, - output_dir, get_vcf_basename(vcf_file), retain_specific_regions, - reference_regions_to_retain, verbose)) + output_dir, get_vcf_basename(vcf_file), reference_regions_to_retain, + verbose)) concurrent.futures.wait(futures, return_when=ALL_COMPLETED) for future in futures: tmp_files.append(future.result()) else: for vcf_file in vcf_files: tmp_files.append(_extract_pgx_regions(pharmcat_positions, vcf_file, tmp_sample_file, output_dir, - get_vcf_basename(vcf_file), retain_specific_regions, - reference_regions_to_retain, verbose)) + get_vcf_basename(vcf_file), reference_regions_to_retain, + verbose)) # write file names to txt file for bcftools tmp_file_list = tmp_dir / 'regions.txt' with open(tmp_file_list, 'w+') as w: @@ -718,7 +714,7 @@ def extract_pgx_regions(pharmcat_positions: Path, vcf_files: List[Path], samples def _extract_pgx_regions(pharmcat_positions: Path, vcf_file: Path, sample_file: Path, output_dir: Path, output_basename: Optional[str], - retain_specific_regions: bool = False, reference_regions_to_retain: Path = None, + reference_regions_to_retain: Path = None, verbose: int = 0) -> Path: """ Does the actual work to extract PGx regions from input VCF file(s) into a single VCF file and @@ -734,7 +730,7 @@ def _extract_pgx_regions(pharmcat_positions: Path, vcf_file: Path, sample_file: if idx_file is None: index_vcf(bgz_file, verbose) - if retain_specific_regions: + if reference_regions_to_retain: df_regions_to_retain = pd.read_csv(reference_regions_to_retain, sep="\t", header=None) df_regions_to_retain = df_regions_to_retain[[0, 1, 2]] df_regions_to_retain.columns = ['CHROM', 'chromStart', 'chromEnd'] @@ -743,7 +739,7 @@ def _extract_pgx_regions(pharmcat_positions: Path, vcf_file: Path, sample_file: df_regions_to_retain['POS'] = df_regions_to_retain['chromStart'] + '-' + df_regions_to_retain['chromEnd'] ref_pgx_regions = df_regions_to_retain[['CHROM', 'POS']] else: - # obtain PGx regions to be extracted + # get PGx regions to be extracted df_pcat_pos = pd.read_csv(str(pharmcat_positions), compression='gzip', sep="\t", comment='#', header=None) df_pcat_pos = df_pcat_pos[[0, 1]] df_pcat_pos.columns = ['CHROM', 'POS'] @@ -1184,8 +1180,8 @@ def extract_pgx_variants(pharmcat_positions: Path, reference_fasta: Path, vcf_fi def _print_missing_positions(pharmcat_positions: Path, ref_pos_dynamic, output_dir: Path, output_basename: str) -> Path: # report missing positions or positions with all unspecified genotypes in the input VCF missing_pos_file: Path = output_dir / (output_basename + _missing_pgx_var_suffix + '.vcf') - # TODO(markwoon): use colorama to highlight this in next release - print("* Cataloging %d missing positions in %s" % (len(ref_pos_dynamic), missing_pos_file)) + print(Fore.RED + "* Cataloging %d missing positions in %s" % (len(ref_pos_dynamic), missing_pos_file) + + Style.RESET_ALL) with open(missing_pos_file, 'w') as out_f: # get VCF header from pharmcat_positions with gzip.open(pharmcat_positions, mode='rt', encoding='utf-8') as in_f: