fix(preprocessor): remove -refRegion argument from preprocessor

PharmGKB · Nov 12, 2024 · 5331c2b · 5331c2b
1 parent f429762
commit 5331c2b
Show file tree

Hide file tree

Showing 6 changed files with 54 additions and 97 deletions.
diff --git a/docs/using/Running-PharmCAT-Pipeline.md b/docs/using/Running-PharmCAT-Pipeline.md
@@ -39,7 +39,7 @@ Standard use case:
 ```
 usage: pharmcat_pipeline [-s <samples> | -S <txt_file>]
                          [-0] [--absent-to-ref] [-unspecified-to-ref] [-G] 
-                         [-R] [-refRegion <bed_file>]
+                         [-R <bed_file>]
                          [-matcher] [-ma] [-matcherHtml] [-research <type>]
                          [-phenotyper]
                          [-reporter] [-rs <sources>] [-re] [-reporterJson]
@@ -73,13 +73,9 @@ Preprocessor arguments:
                         Assume unspecified genotypes ./. as 0/0 when every sample is './.'. DANGEROUS!
   -G, --no-gvcf-check                
                         Bypass the gVCF check for the input VCF.
-  -R, --retain-specific-regions      
-                        Retain all variants in genomic regions of interest in VCF.
-                        Defaults to entire region of genes covered by PharmCAT.
+  -R `<bed_file>`, --retain-specific-regions `<bed_file>`      
+                        A sorted .bed file indicating regions to retain in VCF.
                         For research use only. Additional variants are not used by PharmCAT and will slow PharmCAT down.
-  -refRegion `<bed_file>`, --reference-regions-to-retain `<bed_file>`
-                        A sorted bed file of specific PGx regions to retain.
-                        Use with the `-R` to specify custom regions.
 
 Named allele matcher arguments:
   -matcher              Run named allele matcher independently.

diff --git a/docs/using/VCF-Preprocessor.md b/docs/using/VCF-Preprocessor.md
@@ -154,11 +154,9 @@ look for `pharmcat_positions.vcf.bgz` under the current working directory. You c
 : The [GRCh38.p13](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.39/) FASTA file. The FASTA file can be either decompressed or compressed but has to be indexed (.fai and, in addition, .gzi for the compressed file). We recommended the compressed reference genome FASTA file for the sake of storage. These mandatory files will be automatically downloaded (~0.9 GB) to the same directory
 as the reference PGx VCF file (`-refVcf`) if not provided by user (see [Notes](#notes) for details).
 
--R <span class="altArg"><br />or --retain-specific-regions</span>
-: Retain the genomic regions specified by \'-refRegion\'
-
--refRegion `<bed_file>` <span class="altArg"><br />or --reference-regions-to-retain `<bed_file>`</span>
-: A sorted bed file of specific PGx regions to retain. Must be used with the `-R` argument.
+-R `<bed_file>` <span class="altArg"><br />or --retain-specific-regions `<bed_file>`</span>
+: A sorted .bed file indicating regions to retain in VCF.
+For research use only. Additional variants are not used by PharmCAT and will slow PharmCAT down.
 
 -bcftools `</path/to/bcftools>` <span class="altArg"><br />or --path-to-bcftools `</path/to/bcftools>`</span>
 : bcftools must be installed. This argument is optional if bcftools is available in your PATH.

diff --git a/preprocessor/pharmcat_pipeline b/preprocessor/pharmcat_pipeline
@@ -82,17 +82,11 @@ if __name__ == '__main__':
                                          "DANGEROUS!")
     preprocessor_group.add_argument("-G", "--no-gvcf-check", action="store_true",
                                     help="Bypass check if VCF file is in gVCF format.")
-
-    adv_preprocessor_group = parser.add_argument_group('Advanced preprocessor arguments (research only)')
-    adv_preprocessor_group.add_argument("-R", "--retain-specific-regions", action="store_true",
-                                        help='Retain all variants in genomic regions of interest in VCF. '
-                                             'Defaults to entire region of genes covered by PharmCAT. '
-                                             'For research use only. '
-                                             'Additional variants are not used by PharmCAT and will slow PharmCAT down.')
-    adv_preprocessor_group.add_argument("-refRegion", "--reference-regions-to-retain",
-                                        type=str, metavar='<bed_file>',
-                                        help='A sorted .bed file of PGx regions to retain. '
-                                             'Use with "-R" to specify custom regions.')
+    preprocessor_group.add_argument("-R", "--retain-specific-regions",
+                                    type=str, metavar='<bed_file>',
+                                    help='A sorted .bed file indicating regions to retain in VCF. '
+                                         'For research use only. '
+                                         'Additional variants are not used by PharmCAT and will slow PharmCAT down.')
 
     # matcher args
     matcher_group = parser.add_argument_group('Named allele matcher arguments')
@@ -217,21 +211,10 @@ lead to different results.
             print('Downloading reference FASTA.  This may take a while...')
             m_reference_genome = preprocessor.download_reference_fasta_and_index(m_pharmcat_positions_vcf.parent,
                                                                                  verbose=args.verbose)
-        # make sure we have the pharmcat_regions.bed file if the user choose to retain regions
+        # validate .bed file if retaining specific regions
+        m_regions_bed: Optional[Path] = None
         if args.retain_specific_regions:
-            m_retain_specific_regions: bool = True
-            if args.reference_regions_to_retain:
-                m_pharmcat_regions_bed = preprocessor.validate_file(args.reference_regions_to_retain)
-            else:
-                m_pharmcat_regions_bed = preprocessor.find_file(preprocessor.PHARMCAT_REGIONS_FILENAME,
-                                                                list({script_dir}))
-                if m_pharmcat_regions_bed is None:
-                    print('Downloading %s...' % preprocessor.PHARMCAT_REGIONS_FILENAME)
-                    m_pharmcat_regions_bed = preprocessor.download_pharmcat_accessory_files(
-                        script_dir, download_region_bed=True, verbose=args.verbose)
-        else:
-            m_retain_specific_regions: bool = False
-            m_pharmcat_regions_bed = None
+            m_regions_bed = preprocessor.validate_file(args.retain_specific_regions)
 
         # prep pharmcat_positions helper files
         preprocessor.prep_pharmcat_positions(m_pharmcat_positions_vcf, m_reference_genome, verbose=args.verbose)
@@ -315,8 +298,7 @@ lead to different results.
                 keep_intermediate_files=False,
                 absent_to_ref=m_absent_to_ref,
                 unspecified_to_ref=m_unspecified_to_ref,
-                retain_specific_regions=m_retain_specific_regions,
-                reference_regions_to_retain=m_pharmcat_regions_bed,
+                reference_regions_to_retain=m_regions_bed,
                 concurrent_mode=True,
                 max_processes=m_max_processes,
                 verbose=args.verbose,
@@ -387,8 +369,7 @@ lead to different results.
             keep_intermediate_files=False,
             absent_to_ref=m_absent_to_ref,
             unspecified_to_ref=m_unspecified_to_ref,
-            retain_specific_regions=m_retain_specific_regions,
-            reference_regions_to_retain=m_pharmcat_regions_bed,
+            reference_regions_to_retain=m_regions_bed,
             concurrent_mode=True,
             max_processes=m_max_processes,
             verbose=args.verbose,

diff --git a/preprocessor/pharmcat_vcf_preprocessor.py b/preprocessor/pharmcat_vcf_preprocessor.py
@@ -5,7 +5,7 @@
 import sys
 from pathlib import Path
 from timeit import default_timer as timer
-from typing import List
+from typing import List, Optional
 
 import preprocessor
 
@@ -63,14 +63,11 @@
                                      'or the directory the preprocessor is in.')
     advanced_group.add_argument("-refFna", "--reference-genome", type=str, metavar='<fna_file>',
                                 help="The Human Reference Genome GRCh38/hg38 in the fasta format.")
-    advanced_group.add_argument("-R", "--retain-specific-regions", action="store_true",
-                                help='Retain all variants in genomic regions of interest in VCF. '
-                                     'Defaults to entire region of genes covered by PharmCAT. '
+    advanced_group.add_argument("-R", "--retain-specific-regions",
+                                type=str, metavar='<bed_file>',
+                                help='A sorted .bed file indicating regions to retain in VCF. '
                                      'For research use only. '
                                      'Additional variants are not used by PharmCAT and will slow PharmCAT down.')
-    advanced_group.add_argument("-refRegion", "--reference-regions-to-retain", type=str, metavar='<bed_file>',
-                                help='A sorted .bed file of PGx regions to retain. '
-                                     'Use with "-R" to specify custom regions.')
     advanced_group.add_argument("-bcftools", "--path-to-bcftools", type=str, metavar='</path/to/bcftools>',
                                 help="Path to the bcftools program.  Defaults to bcftools in PATH.")
     advanced_group.add_argument("-bgzip", "--path-to-bgzip", type=str, metavar='</path/to/bgzip>',
@@ -162,22 +159,10 @@
                 m_reference_genome = preprocessor.download_reference_fasta_and_index(m_pharmcat_positions_vcf.parent,
                                                                                      verbose=args.verbose)
 
-        # make sure we have pharmcat_regions.bed
+        # validate .bed file if retaining specific regions
+        m_regions_bed: Optional[Path] = None
         if args.retain_specific_regions:
-            m_retain_specific_regions: bool = True
-            m_pharmcat_regions_bed: Path
-            if args.reference_regions_to_retain:
-                m_pharmcat_regions_bed = preprocessor.validate_file(args.reference_regions_to_retain)
-            else:
-                m_pharmcat_regions_bed = preprocessor.find_file(preprocessor.PHARMCAT_REGIONS_FILENAME,
-                                                                list({Path.cwd(), script_dir}))
-                if m_pharmcat_regions_bed is None:
-                    print('Downloading pharmcat_regions.bed...')
-                    m_pharmcat_regions_bed = preprocessor.download_pharmcat_accessory_files(
-                        script_dir, download_region_bed=True, verbose=args.verbose)
-        else:
-            m_retain_specific_regions: bool = False
-            m_pharmcat_regions_bed = None
+            m_regions_bed = preprocessor.validate_file(args.retain_specific_regions)
 
         # prep pharmcat_positions helper files
         preprocessor.prep_pharmcat_positions(m_pharmcat_positions_vcf, m_reference_genome, verbose=args.verbose)
@@ -275,8 +260,7 @@
                                           keep_intermediate_files=args.keep_intermediate_files,
                                           absent_to_ref=m_absent_to_ref,
                                           unspecified_to_ref=m_unspecified_to_ref,
-                                          retain_specific_regions=m_retain_specific_regions,
-                                          reference_regions_to_retain=m_pharmcat_regions_bed,
+                                          reference_regions_to_retain=m_regions_bed,
                                           concurrent_mode=args.concurrent_mode,
                                           max_processes=m_max_processes,
                                           verbose=args.verbose,

diff --git a/preprocessor/preprocessor/preprocess.py b/preprocessor/preprocessor/preprocess.py
@@ -11,7 +11,7 @@ def preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
                output_dir: Path, output_basename: Optional[str] = '', split_samples: bool = False,
                keep_intermediate_files: bool = False,
                absent_to_ref: bool = False, unspecified_to_ref: bool = False,
-               retain_specific_regions: bool = False, reference_regions_to_retain: Path = None,
+               reference_regions_to_retain: Path = None,
                concurrent_mode: bool = False, max_processes: int = 1, verbose: int = 0) -> List[Path]:
     """
     Normalize and prepare the input VCF for PharmCAT.
@@ -34,7 +34,7 @@ def preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
                                   vcf_files, samples,
                                   output_dir, basename,
                                   keep_intermediate_files, absent_to_ref, unspecified_to_ref,
-                                  retain_specific_regions, reference_regions_to_retain,
+                                  reference_regions_to_retain,
                                   concurrent_mode, max_processes, verbose)
     if split_samples and len(samples) > 1:
         util.index_vcf(multisample_vcf, verbose)
@@ -54,8 +54,9 @@ def preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
 def preprocess_multiple_files(pharmcat_positions_vcf: Path, reference_genome: Path,
                               vcf_files: List[Path], samples: Optional[List[str]],
                               output_dir: Path, output_basename: Optional[str] = '',
-                              keep_intermediate_files: bool = False, missing_to_ref: bool = False,
-                              retain_specific_regions: bool = False, reference_regions_to_retain: Path = None,
+                              keep_intermediate_files: bool = False,
+                              absent_to_ref: bool = False, unspecified_to_ref: bool = False,
+                              reference_regions_to_retain: Path = None,
                               concurrent_mode: bool = False, max_processes: int = 1, verbose: int = 0) -> List[Path]:
     """
     Normalize and prepare the input VCF for PharmCAT.
@@ -70,7 +71,7 @@ def preprocess_multiple_files(pharmcat_positions_vcf: Path, reference_genome: Pa
         if samples is None or len(samples) == 0:
             file_samples = util.read_vcf_samples(file, verbose=verbose)
         else:
-            # make sure samples are in vcf file
+            # make sure samples are in the VCF file
             vcf_samples = util.read_vcf_samples(vcf_files[0], verbose=verbose)
             for sample in samples:
                 if sample in vcf_samples:
@@ -82,8 +83,9 @@ def preprocess_multiple_files(pharmcat_positions_vcf: Path, reference_genome: Pa
         multisample_vcf = _preprocess(pharmcat_positions_vcf, reference_genome,
                                       [file], file_samples,
                                       output_dir, basename,
-                                      keep_intermediate_files, missing_to_ref,
-                                      retain_specific_regions, reference_regions_to_retain,
+                                      keep_intermediate_files,
+                                      absent_to_ref, unspecified_to_ref,
+                                      reference_regions_to_retain,
                                       concurrent_mode, max_processes, verbose)
         results.append(finalize_multisample_vcf(multisample_vcf, output_dir, basename))
 
@@ -95,14 +97,14 @@ def _preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
                 output_dir: Path, output_basename: Optional[str] = '',
                 keep_intermediate_files: bool = False,
                 absent_to_ref: bool = False, unspecified_to_ref: bool = False,
-                retain_specific_regions: bool = False, reference_regions_to_retain: Path = None,
+                reference_regions_to_retain: Path = None,
                 concurrent_mode=False, max_processes=1, verbose: int = 0) -> Path:
 
     # shrink input VCF down to PGx allele defining regions and selected samples
-    # modify input VCF chromosomes naming format to <chr##>
+    # standardize chromosome names to <chr##>
     pgx_region_vcf: Path = util.extract_pgx_regions(pharmcat_positions_vcf, vcf_files, samples,
                                                     output_dir, output_basename,
-                                                    retain_specific_regions, reference_regions_to_retain,
+                                                    reference_regions_to_retain,
                                                     concurrent_mode=concurrent_mode, max_processes=max_processes,
                                                     verbose=verbose)
     # normalize the input VCF
@@ -114,7 +116,7 @@ def _preprocess(pharmcat_positions_vcf: Path, reference_genome: Path,
                                                        output_dir, output_basename,
                                                        absent_to_ref=absent_to_ref,
                                                        unspecified_to_ref=unspecified_to_ref,
-                                                       retain_specific_regions=retain_specific_regions,
+                                                       retain_specific_regions=bool(reference_regions_to_retain),
                                                        verbose=verbose)
 
     if not keep_intermediate_files: