Skip to content

Commit

Permalink
issue #818 - don't uniq on preprocess anymore
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed May 9, 2024
1 parent 55a786f commit 7aecaa5
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions upload/vcf/vcf_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ def preprocess_vcf(upload_step, remove_info=False, annotate_gnomad_af=False):
VCF_CLEAN_AND_FILTER_SUB_STEP = "vcf_clean_and_filter"
DECOMPOSE_SUB_STEP = "decompose"
NORMALIZE_SUB_STEP = "normalize"
UNIQ_SUB_STEP = "uniq"
REMOVE_HEADER_SUB_STEP = "remove_header"
SPLIT_VCF_SUB_STEP = "split_vcf"

Expand Down Expand Up @@ -109,7 +108,9 @@ def preprocess_vcf(upload_step, remove_info=False, annotate_gnomad_af=False):
# VT isn't the bottleneck here, it's my programs - so no speed advantage to using "+" for Uncompressed BCF streams
pipe_commands[DECOMPOSE_SUB_STEP] = [settings.VCF_IMPORT_VT_COMMAND, "decompose", "-s", "-"]
pipe_commands[NORMALIZE_SUB_STEP] = [settings.VCF_IMPORT_VT_COMMAND, "normalize", "-n", "-r", genome_build.reference_fasta, "-"]
pipe_commands[UNIQ_SUB_STEP] = [settings.VCF_IMPORT_VT_COMMAND, "uniq", "-"]
# We don't run 'uniq' anymore, as neither Vt nor Bcftools handles SVLEN properly (so it has been removed from the sub_step_name loop below)
# @see https://github.com/SACGF/variantgrid/issues/818
# pipe_commands[UNIQ_SUB_STEP] = [settings.VCF_IMPORT_VT_COMMAND, "uniq", "-"]

# Split up the VCF
split_vcf_dir = upload_pipeline.get_pipeline_processing_subdir("split_vcf")
Expand All @@ -122,7 +123,7 @@ def preprocess_vcf(upload_step, remove_info=False, annotate_gnomad_af=False):
"--lines", str(settings.VCF_IMPORT_FILE_SPLIT_ROWS),
f"--filter='sh -c \"{{ cat {split_headers_filename}; cat; }} | bgzip -c > {split_vcf_dir}/$FILE\"'"]

for sub_step_name in [DECOMPOSE_SUB_STEP, NORMALIZE_SUB_STEP, UNIQ_SUB_STEP]:
for sub_step_name in [DECOMPOSE_SUB_STEP, NORMALIZE_SUB_STEP]:
sub_step_commands = pipe_commands[sub_step_name]
sub_steps[sub_step_name] = create_sub_step(upload_step, sub_step_name, sub_step_commands)

Expand Down

0 comments on commit 7aecaa5

Please sign in to comment.