From 5331c2b0dd1f2538763dc7fb5fd598698b58fe89 Mon Sep 17 00:00:00 2001
From: Mark Woon <markwoon@gmail.com>
Date: Mon, 11 Nov 2024 18:03:28 -0800
Subject: [PATCH] fix(preprocessor): remove -refRegion argument from
 preprocessor

---
 docs/using/Running-PharmCAT-Pipeline.md   | 10 ++----
 docs/using/VCF-Preprocessor.md            |  8 ++---
 preprocessor/pharmcat_pipeline            | 39 ++++++-----------------
 preprocessor/pharmcat_vcf_preprocessor.py | 32 +++++--------------
 preprocessor/preprocessor/preprocess.py   | 24 +++++++-------
 preprocessor/preprocessor/utilities.py    | 38 ++++++++++------------
 6 files changed, 54 insertions(+), 97 deletions(-)
diff --git a/docs/using/Running-PharmCAT-Pipeline.md b/docs/using/Running-PharmCAT-Pipeline.md
index 98ad5015..96a3d5b6 100644
--- a/docs/using/Running-PharmCAT-Pipeline.md
+++ b/docs/using/Running-PharmCAT-Pipeline.md
@@ -39,7 +39,7 @@ Standard use case:
 ```
 usage: pharmcat_pipeline [-s <samples> | -S <txt_file>]
                          [-0] [--absent-to-ref] [-unspecified-to-ref] [-G] 
-                         [-R] [-refRegion <bed_file>]
+                         [-R <bed_file>]
                          [-matcher] [-ma] [-matcherHtml] [-research <type>]
                          [-phenotyper]
                          [-reporter] [-rs <sources>] [-re] [-reporterJson]
@@ -73,13 +73,9 @@ Preprocessor arguments:
                         Assume unspecified genotypes ./. as 0/0 when every sample is './.'. DANGEROUS!
   -G, --no-gvcf-check                
                         Bypass the gVCF check for the input VCF.
-  -R, --retain-specific-regions      
-                        Retain all variants in genomic regions of interest in VCF.
-                        Defaults to entire region of genes covered by PharmCAT.
+  -R `<bed_file>`, --retain-specific-regions `<bed_file>`      
+                        A sorted .bed file indicating regions to retain in VCF.
                         For research use only. Additional variants are not used by PharmCAT and will slow PharmCAT down.
-  -refRegion `<bed_file>`, --reference-regions-to-retain `<bed_file>`
-                        A sorted bed file of specific PGx regions to retain.
-                        Use with the `-R` to specify custom regions.
 
 Named allele matcher arguments:
   -matcher              Run named allele matcher independently.
diff --git a/docs/using/VCF-Preprocessor.md b/docs/using/VCF-Preprocessor.md
index 0ec7e0c4..e962f962 100644
--- a/docs/using/VCF-Preprocessor.md
+++ b/docs/using/VCF-Preprocessor.md
@@ -154,11 +154,9 @@ look for `pharmcat_positions.vcf.bgz` under the current working directory. You c
 : The [GRCh38.p13](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.39/) FASTA file. The FASTA file can be either decompressed or compressed but has to be indexed (.fai and, in addition, .gzi for the compressed file). We recommended the compressed reference genome FASTA file for the sake of storage. These mandatory files will be automatically downloaded (~0.9 GB) to the same directory
 as the reference PGx VCF file (`-refVcf`) if not provided by user (see [Notes](#notes) for details).
 
--R <span class="altArg"><br />or --retain-specific-regions</span>
-: Retain the genomic regions specified by \'-refRegion\'
-
--refRegion `<bed_file>` <span class="altArg"><br />or --reference-regions-to-retain `<bed_file>`</span>
-: A sorted bed file of specific PGx regions to retain. Must be used with the `-R` argument.
+-R `<bed_file>` <span class="altArg"><br />or --retain-specific-regions `<bed_file>`</span>
+: A sorted .bed file indicating regions to retain in VCF.
+For research use only. Additional variants are not used by PharmCAT and will slow PharmCAT down.
 
 -bcftools `</path/to/bcftools>` <span class="altArg"><br />or --path-to-bcftools `</path/to/bcftools>`</span>
 : bcftools must be installed. This argument is optional if bcftools is available in your PATH.
diff --git a/preprocessor/pharmcat_pipeline b/preprocessor/pharmcat_pipeline
index 2ebc00a9..9892699c 100644
--- a/preprocessor/pharmcat_pipeline
+++ b/preprocessor/pharmcat_pipeline
@@ -82,17 +82,11 @@ if __name__ == '__main__':
                                          "DANGEROUS!")
     preprocessor_group.add_argument("-G", "--no-gvcf-check", action="store_true",
                                     help="Bypass check if VCF file is in gVCF format.")
-
-    adv_preprocessor_group = parser.add_argument_group('Advanced preprocessor arguments (research only)')
-    adv_preprocessor_group.add_argument("-R", "--retain-specific-regions", action="store_true",
-                                        help='Retain all variants in genomic regions of interest in VCF. '
-                                             'Defaults to entire region of genes covered by PharmCAT. '
-                                             'For research use only. '
-                                             'Additional variants are not used by PharmCAT and will slow PharmCAT down.')
-    adv_preprocessor_group.add_argument("-refRegion", "--reference-regions-to-retain",
-                                        type=str, metavar='<bed_file>',
-                                        help='A sorted .bed file of PGx regions to retain. '
-                                             'Use with "-R" to specify custom regions.')
+    preprocessor_group.add_argument("-R", "--retain-specific-regions",
+                                    type=str, metavar='<bed_file>',
+                                    help='A sorted .bed file indicating regions to retain in VCF. '
+                                         'For research use only. '
+                                         'Additional variants are not used by PharmCAT and will slow PharmCAT down.')
 
     # matcher args
     matcher_group = parser.add_argument_group('Named allele matcher arguments')
@@ -217,21 +211,10 @@ lead to different results.
             print('Downloading reference FASTA.  This may take a while...')
             m_reference_genome = preprocessor.download_reference_fasta_and_index(m_pharmcat_positions_vcf.parent,
                                                                                  verbose=args.verbose)
-        # make sure we have the pharmcat_regions.bed file if the user choose to retain regions
+        # validate .bed file if retaining specific regions
+        m_regions_bed: Optional[Path] = None
         if args.retain_specific_regions:
-            m_retain_specific_regions: bool = True
-            if args.reference_regions_to_retain:
-                m_pharmcat_regions_bed = preprocessor.validate_file(args.reference_regions_to_retain)
-            else:
-                m_pharmcat_regions_bed = preprocessor.find_file(preprocessor.PHARMCAT_REGIONS_FILENAME,
-                                                                list({script_dir}))
-                if m_pharmcat_regions_bed is None:
-                    print('Downloading %s...' % preprocessor.PHARMCAT_REGIONS_FILENAME)
-                    m_pharmcat_regions_bed = preprocessor.download_pharmcat_accessory_files(
-                        script_dir, download_region_bed=True, verbose=args.verbose)
-        else:
-            m_retain_specific_regions: bool = False
-            m_pharmcat_regions_bed = None
+            m_regions_bed = preprocessor.validate_file(args.retain_specific_regions)
 
         # prep pharmcat_positions helper files
         preprocessor.prep_pharmcat_positions(m_pharmcat_positions_vcf, m_reference_genome, verbose=args.verbose)
@@ -315,8 +298,7 @@ lead to different results.
                 keep_intermediate_files=False,
                 absent_to_ref=m_absent_to_ref,
                 unspecified_to_ref=m_unspecified_to_ref,
-                retain_specific_regions=m_retain_specific_regions,
-                reference_regions_to_retain=m_pharmcat_regions_bed,
+                reference_regions_to_retain=m_regions_bed,
                 concurrent_mode=True,
                 max_processes=m_max_processes,
                 verbose=args.verbose,
@@ -387,8 +369,7 @@ lead to different results.
             keep_intermediate_files=False,
             absent_to_ref=m_absent_to_ref,
             unspecified_to_ref=m_unspecified_to_ref,
-            retain_specific_regions=m_retain_specific_regions,
-            reference_regions_to_retain=m_pharmcat_regions_bed,
+            reference_regions_to_retain=m_regions_bed,
             concurrent_mode=True,
             max_processes=m_max_processes,
             verbose=args.verbose,
diff --git a/preprocessor/pharmcat_vcf_preprocessor.py b/preprocessor/pharmcat_vcf_preprocessor.py
index de860219..2e4636f3 100644
--- a/preprocessor/pharmcat_vcf_preprocessor.py
+++ b/preprocessor/pharmcat_vcf_preprocessor.py
@@ -5,7 +5,7 @@
 import sys
 from pathlib import Path
 from timeit import default_timer as timer
-from typing import List
+from typing import List, Optional
 
 import preprocessor
 
@@ -63,14 +63,11 @@
                                      'or the directory the preprocessor is in.')
     advanced_group.add_argument("-refFna", "--reference-genome", type=str, metavar='<fna_file>',
                                 help="The Human Reference Genome GRCh38/hg38 in the fasta format.")
-    advanced_group.add_argument("-R", "--retain-specific-regions", action="store_true",
-                                help='Retain all variants in genomic regions of interest in VCF. '
-                                     'Defaults to entire region of genes covered by PharmCAT. '
+    advanced_group.add_argument("-R", "--retain-specific-regions",
+                                type=str, metavar='<bed_file>',
+                                help='A sorted .bed file indicating regions to retain in VCF. '
                                      'For research use only. '
                                      'Additional variants are not used by PharmCAT and will slow PharmCAT down.')
-    advanced_group.add_argument("-refRegion", "--reference-regions-to-retain", type=str, metavar='<bed_file>',
-                                help='A sorted .bed file of PGx regions to retain. '
-                                     'Use with "-R" to specify custom regions.')
     advanced_group.add_argument("-bcftools", "--path-to-bcftools", type=str, metavar='</path/to/bcftools>',
                                 help="Path to the bcftools program.  Defaults to bcftools in PATH.")
     advanced_group.add_argument("-bgzip", "--path-to-bgzip", type=str, metavar='</path/to/bgzip>',
@@ -162,22 +159,10 @@
                 m_reference_genome = preprocessor.download_reference_fasta_and_index(m_pharmcat_positions_vcf.parent,
                                                                                      verbose=args.verbose)
 
-        # make sure we have pharmcat_regions.bed
+        # validate .bed file if retaining specific regions
+        m_regions_bed: Optional[Path] = None
         if args.retain_specific_regions:
-            m_retain_specific_regions: bool = True
-            m_pharmcat_regions_bed: Path
-            if args.reference_regions_to_retain:
-                m_pharmcat_regions_bed = preprocessor.validate_file(args.reference_regions_to_retain)
-            else:
-                m_pharmcat_regions_bed = preprocessor.find_file(preprocessor.PHARMCAT_REGIONS_FILENAME,
-                                                                list({Path.cwd(), script_dir}))
-                if m_pharmcat_regions_bed is None:
-                    print('Downloading pharmcat_regions.bed...')
-                    m_pharmcat_regions_bed = preprocessor.download_pharmcat_accessory_files(
-                        script_dir, download_region_bed=True, verbose=args.verbose)
-        else:
-            m_retain_specific_regions: bool = False
-            m_pharmcat_regions_bed = None
+            m_regions_bed = preprocessor.validate_file(args.retain_specific_regions)
 
         # prep pharmcat_positions helper files
         preprocessor.prep_pharmcat_positions(m_pharmcat_positions_vcf, m_reference_genome, verbose=args.verbose)
@@ -275,8 +260,7 @@
                                           keep_intermediate_files=args.keep_intermediate_files,
                                           absent_to_ref=m_absent_to_ref,
                                           unspecified_to_ref=m_unspecified_to_ref,
-                                          retain_specific_regions=m_retain_specific_regions,
-                                          reference_regions_to_retain=m_pharmcat_regions_bed,
+                                          reference_regions_to_retain=m_regions_bed,
                                           concurrent_mode=args.concurrent_mode,
                                           max_processes=m_max_processes,
                                           verbose=args.verbose,
diff --git a/preprocessor/preprocessor/preprocess.py b/preprocessor/preprocessor/preprocess.py
index 1f9b8a46..e83d4135 100644
--- a/preprocessor/preprocessor/preprocess.py
+++ b/preprocessor/preprocessor/preprocess.py
@@ -11,7 +11,7 @@ def preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
                output_dir: Path, output_basename: Optional[str] = '', split_samples: bool = False,
                keep_intermediate_files: bool = False,
                absent_to_ref: bool = False, unspecified_to_ref: bool = False,
-               retain_specific_regions: bool = False, reference_regions_to_retain: Path = None,
+               reference_regions_to_retain: Path = None,
                concurrent_mode: bool = False, max_processes: int = 1, verbose: int = 0) -> List[Path]:
     """
     Normalize and prepare the input VCF for PharmCAT.
@@ -34,7 +34,7 @@ def preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
                                   vcf_files, samples,
                                   output_dir, basename,
                                   keep_intermediate_files, absent_to_ref, unspecified_to_ref,
-                                  retain_specific_regions, reference_regions_to_retain,
+                                  reference_regions_to_retain,
                                   concurrent_mode, max_processes, verbose)
     if split_samples and len(samples) > 1:
         util.index_vcf(multisample_vcf, verbose)
@@ -54,8 +54,9 @@ def preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
 def preprocess_multiple_files(pharmcat_positions_vcf: Path, reference_genome: Path,
                               vcf_files: List[Path], samples: Optional[List[str]],
                               output_dir: Path, output_basename: Optional[str] = '',
-                              keep_intermediate_files: bool = False, missing_to_ref: bool = False,
-                              retain_specific_regions: bool = False, reference_regions_to_retain: Path = None,
+                              keep_intermediate_files: bool = False,
+                              absent_to_ref: bool = False, unspecified_to_ref: bool = False,
+                              reference_regions_to_retain: Path = None,
                               concurrent_mode: bool = False, max_processes: int = 1, verbose: int = 0) -> List[Path]:
     """
     Normalize and prepare the input VCF for PharmCAT.
@@ -70,7 +71,7 @@ def preprocess_multiple_files(pharmcat_positions_vcf: Path, reference_genome: Pa
         if samples is None or len(samples) == 0:
             file_samples = util.read_vcf_samples(file, verbose=verbose)
         else:
-            # make sure samples are in vcf file
+            # make sure samples are in the VCF file
             vcf_samples = util.read_vcf_samples(vcf_files[0], verbose=verbose)
             for sample in samples:
                 if sample in vcf_samples:
@@ -82,8 +83,9 @@ def preprocess_multiple_files(pharmcat_positions_vcf: Path, reference_genome: Pa
         multisample_vcf = _preprocess(pharmcat_positions_vcf, reference_genome,
                                       [file], file_samples,
                                       output_dir, basename,
-                                      keep_intermediate_files, missing_to_ref,
-                                      retain_specific_regions, reference_regions_to_retain,
+                                      keep_intermediate_files,
+                                      absent_to_ref, unspecified_to_ref,
+                                      reference_regions_to_retain,
                                       concurrent_mode, max_processes, verbose)
         results.append(finalize_multisample_vcf(multisample_vcf, output_dir, basename))
 
@@ -95,14 +97,14 @@ def _preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
                 output_dir: Path, output_basename: Optional[str] = '',
                 keep_intermediate_files: bool = False,
                 absent_to_ref: bool = False, unspecified_to_ref: bool = False,
-                retain_specific_regions: bool = False, reference_regions_to_retain: Path = None,
+                reference_regions_to_retain: Path = None,
                 concurrent_mode=False, max_processes=1, verbose: int = 0) -> Path:
 
     # shrink input VCF down to PGx allele defining regions and selected samples
-    # modify input VCF chromosomes naming format to <chr##>
+    # standardize chromosome names to <chr##>
     pgx_region_vcf: Path = util.extract_pgx_regions(pharmcat_positions_vcf, vcf_files, samples,
                                                     output_dir, output_basename,
-                                                    retain_specific_regions, reference_regions_to_retain,
+                                                    reference_regions_to_retain,
                                                     concurrent_mode=concurrent_mode, max_processes=max_processes,
                                                     verbose=verbose)
     # normalize the input VCF
@@ -114,7 +116,7 @@ def _preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
                                                        output_dir, output_basename,
                                                        absent_to_ref=absent_to_ref,
                                                        unspecified_to_ref=unspecified_to_ref,
-                                                       retain_specific_regions=retain_specific_regions,
+                                                       retain_specific_regions=bool(reference_regions_to_retain),
                                                        verbose=verbose)
 
     if not keep_intermediate_files:
diff --git a/preprocessor/preprocessor/utilities.py b/preprocessor/preprocessor/utilities.py
index 9177ed09..affda6a8 100644
--- a/preprocessor/preprocessor/utilities.py
+++ b/preprocessor/preprocessor/utilities.py
@@ -19,6 +19,7 @@
 from urllib.error import HTTPError
 
 import pandas as pd
+from colorama import Fore, Style
 from packaging import version
 
 from . import common
@@ -394,8 +395,8 @@ def read_sample_file(sample_file: Path, verbose: int = 0) -> List[str]:
             if line and not line.startswith("#"):
                 samples.append(line)
     if len(samples) == 0:
-        # TODO(markwoon): use colorama to highlight this in next release
-        print("Warning: No samples found. Will use all samples listed in VCF.")
+        print(Fore.RED + "Warning: No samples found. Will use all samples listed in VCF." +
+              Style.RESET_ALL)
     else:
         validate_samples(samples)
     return samples
@@ -526,20 +527,15 @@ def download_from_url(url: str, download_dir: Path, force_update: bool = False,
         raise InvalidURL(url)
 
 
-def download_pharmcat_accessory_files(download_dir: Union[Path, str], force_update: bool = False,
-                                      download_region_bed: bool = False, verbose: int = 0):
+def download_pharmcat_accessory_files(download_dir: Union[Path, str], force_update: bool = False, verbose: int = 0):
     if not isinstance(download_dir, Path):
         download_dir = Path(download_dir)
     pos_file = download_dir / common.PHARMCAT_POSITIONS_FILENAME
     if pos_file.exists() and not force_update:
         return pos_file
 
-    if download_region_bed:
-        url = 'https://github.com/PharmGKB/PharmCAT/releases/download/v%s/pharmcat_regions_%s.bed' % \
-              (common.PHARMCAT_VERSION, common.PHARMCAT_VERSION)
-    else:
-        url = 'https://github.com/PharmGKB/PharmCAT/releases/download/v%s/pharmcat_positions_%s.vcf.bgz' %\
-            (common.PHARMCAT_VERSION, common.PHARMCAT_VERSION)
+    url = 'https://github.com/PharmGKB/PharmCAT/releases/download/v%s/pharmcat_positions_%s.vcf.bgz' %\
+          (common.PHARMCAT_VERSION, common.PHARMCAT_VERSION)
     try:
         dl_file = download_from_url(url, download_dir, force_update, verbose)
         # use shutil.move instead of rename to deal with cross-device issues
@@ -656,7 +652,7 @@ def _is_valid_chr(vcf_file: Path) -> bool:
 
 def extract_pgx_regions(pharmcat_positions: Path, vcf_files: List[Path], samples: List[str],
                         output_dir: Path, output_basename: str,
-                        retain_specific_regions: bool = False, reference_regions_to_retain: Path = None,
+                        reference_regions_to_retain: Path = None,
                         concurrent_mode: bool = False, max_processes: int = 1,
                         verbose: int = 0) -> Path:
     """
@@ -678,7 +674,7 @@ def extract_pgx_regions(pharmcat_positions: Path, vcf_files: List[Path], samples
         if len(vcf_files) == 1:
             # this should create pgx_region_vcf_file
             _extract_pgx_regions(pharmcat_positions, vcf_files[0], tmp_sample_file, output_dir, output_basename,
-                                 retain_specific_regions, reference_regions_to_retain, verbose)
+                                 reference_regions_to_retain, verbose)
         else:
             # generate files to be concatenated
             tmp_files: List[Path] = []
@@ -688,16 +684,16 @@ def extract_pgx_regions(pharmcat_positions: Path, vcf_files: List[Path], samples
                     futures = []
                     for vcf_file in vcf_files:
                         futures.append(e.submit(_extract_pgx_regions, pharmcat_positions, vcf_file, tmp_sample_file,
-                                                output_dir, get_vcf_basename(vcf_file), retain_specific_regions,
-                                                reference_regions_to_retain, verbose))
+                                                output_dir, get_vcf_basename(vcf_file), reference_regions_to_retain,
+                                                verbose))
                     concurrent.futures.wait(futures, return_when=ALL_COMPLETED)
                     for future in futures:
                         tmp_files.append(future.result())
             else:
                 for vcf_file in vcf_files:
                     tmp_files.append(_extract_pgx_regions(pharmcat_positions, vcf_file, tmp_sample_file, output_dir,
-                                                          get_vcf_basename(vcf_file), retain_specific_regions,
-                                                          reference_regions_to_retain, verbose))
+                                                          get_vcf_basename(vcf_file), reference_regions_to_retain,
+                                                          verbose))
             # write file names to txt file for bcftools
             tmp_file_list = tmp_dir / 'regions.txt'
             with open(tmp_file_list, 'w+') as w:
@@ -718,7 +714,7 @@ def extract_pgx_regions(pharmcat_positions: Path, vcf_files: List[Path], samples
 
 def _extract_pgx_regions(pharmcat_positions: Path, vcf_file: Path, sample_file: Path, output_dir: Path,
                          output_basename: Optional[str],
-                         retain_specific_regions: bool = False, reference_regions_to_retain: Path = None,
+                         reference_regions_to_retain: Path = None,
                          verbose: int = 0) -> Path:
     """
     Does the actual work to extract PGx regions from input VCF file(s) into a single VCF file and
@@ -734,7 +730,7 @@ def _extract_pgx_regions(pharmcat_positions: Path, vcf_file: Path, sample_file:
     if idx_file is None:
         index_vcf(bgz_file, verbose)
 
-    if retain_specific_regions:
+    if reference_regions_to_retain:
         df_regions_to_retain = pd.read_csv(reference_regions_to_retain, sep="\t", header=None)
         df_regions_to_retain = df_regions_to_retain[[0, 1, 2]]
         df_regions_to_retain.columns = ['CHROM', 'chromStart', 'chromEnd']
@@ -743,7 +739,7 @@ def _extract_pgx_regions(pharmcat_positions: Path, vcf_file: Path, sample_file:
         df_regions_to_retain['POS'] = df_regions_to_retain['chromStart'] + '-' + df_regions_to_retain['chromEnd']
         ref_pgx_regions = df_regions_to_retain[['CHROM', 'POS']]
     else:
-        # obtain PGx regions to be extracted
+        # get PGx regions to be extracted
         df_pcat_pos = pd.read_csv(str(pharmcat_positions), compression='gzip', sep="\t", comment='#', header=None)
         df_pcat_pos = df_pcat_pos[[0, 1]]
         df_pcat_pos.columns = ['CHROM', 'POS']
@@ -1184,8 +1180,8 @@ def extract_pgx_variants(pharmcat_positions: Path, reference_fasta: Path, vcf_fi
 def _print_missing_positions(pharmcat_positions: Path, ref_pos_dynamic, output_dir: Path, output_basename: str) -> Path:
     # report missing positions or positions with all unspecified genotypes in the input VCF
     missing_pos_file: Path = output_dir / (output_basename + _missing_pgx_var_suffix + '.vcf')
-    # TODO(markwoon): use colorama to highlight this in next release
-    print("* Cataloging %d missing positions in %s" % (len(ref_pos_dynamic), missing_pos_file))
+    print(Fore.RED + "* Cataloging %d missing positions in %s" % (len(ref_pos_dynamic), missing_pos_file) +
+          Style.RESET_ALL)
     with open(missing_pos_file, 'w') as out_f:
         # get VCF header from pharmcat_positions
         with gzip.open(pharmcat_positions, mode='rt', encoding='utf-8') as in_f: