Added a new suite of tools for variant filtering based on site-level annotations. #7954

Merged: 10 commits, Aug 9, 2022
8 changes: 7 additions & 1 deletion .github/workflows/gatk-tests.yml
@@ -291,7 +291,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
wdlTest: [ 'RUN_CNV_GERMLINE_COHORT_WDL', 'RUN_CNV_GERMLINE_CASE_WDL', 'RUN_CNV_SOMATIC_WDL', 'RUN_M2_WDL', 'RUN_CNN_WDL' ]
wdlTest: [ 'RUN_CNV_GERMLINE_COHORT_WDL', 'RUN_CNV_GERMLINE_CASE_WDL', 'RUN_CNV_SOMATIC_WDL', 'RUN_M2_WDL', 'RUN_CNN_WDL', 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' ]
continue-on-error: true
name: WDL test ${{ matrix.wdlTest }} on cromwell
steps:
@@ -349,3 +349,9 @@ jobs:
run: |
echo "Running CNN WDL";
bash scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh;

- name: "VCF_SITE_LEVEL_FILTERING_WDL_TEST"
if: ${{ matrix.wdlTest == 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' }}
run: |
echo "Running VCF Site Level Filtering WDL";
bash scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh;
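For local debugging outside the Actions runner, the new CI step can be approximated as in the sketch below; the Cromwell jar path and checkout location are assumptions, not values the workflow provides.

```bash
# Sketch only: approximate the new CI step on a local checkout.
# CROMWELL_JAR below is an assumption; note the test script also hardcodes the
# CI runner's WORKING_DIR (/home/runner/work/gatk), so a local run needs that
# path to exist or be adjusted.
export CROMWELL_JAR="$HOME/tools/cromwell.jar"
cd /path/to/gatk
echo "Running VCF Site Level Filtering WDL"
bash scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh
```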
1 change: 1 addition & 0 deletions build.gradle
@@ -285,6 +285,7 @@ dependencies {

implementation 'org.apache.commons:commons-lang3:3.5'
implementation 'org.apache.commons:commons-math3:3.5'
implementation 'org.hipparchus:hipparchus-stat:2.0'
implementation 'org.apache.commons:commons-collections4:4.1'
implementation 'org.apache.commons:commons-vfs2:2.0'
implementation 'org.apache.commons:commons-configuration2:2.4'
1 change: 1 addition & 0 deletions scripts/gatkcondaenv.yml.template
@@ -42,6 +42,7 @@ dependencies:
- conda-forge::matplotlib=3.2.1
- conda-forge::pandas=1.0.3
- conda-forge::typing_extensions=4.1.1 # see https://github.com/broadinstitute/gatk/issues/7800 and linked PRs
- conda-forge::dill=0.3.4 # used for pickling lambdas in TrainVariantAnnotationsModel

# core R dependencies; these should only be used for plotting and do not take precedence over core python dependencies!
- r-base=3.6.2
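As an aside on the dill pin above: the standard-library pickle cannot serialize lambdas, which is exactly what TrainVariantAnnotationsModel needs to do; the one-liners below are illustrative only.

```bash
# Illustration: stdlib pickle rejects lambdas; dill serializes them.
python -c "import pickle; pickle.dumps(lambda x: x)"          # raises PicklingError
python -c "import dill; print(len(dill.dumps(lambda x: x)))"  # prints a byte count
```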
9 changes: 9 additions & 0 deletions scripts/vcf_site_level_filtering_cromwell_tests/README.md
@@ -0,0 +1,9 @@
# Filtering Automated Tests for WDL

**This directory is for GATK devs only**

This directory contains scripts for running the VCF site-level filtering WDL tests in the automated CI build environment.

Please note that these tests only check whether the WDL completes successfully, not the correctness of its outputs.

The test data constitute a "plumbing test": a small subset of a 10-sample callset.
38 changes: 38 additions & 0 deletions scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh
@@ -0,0 +1,38 @@
#!/bin/bash -l
set -e
# cd to the directory of the script in order to use relative paths
script_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
cd "$script_path"

WORKING_DIR=/home/runner/work/gatk

echo "Building docker image for VCF Site Level Filtering WDL tests (skipping unit tests)..."

#assume Dockerfile is in root
echo "Building docker without running unit tests... ========="
cd $WORKING_DIR/gatk

# IMPORTANT: This code is duplicated in the CNV and M2 WDL tests.
if [ ! -z "$CI_PULL_REQUEST" ]; then
HASH_TO_USE=FETCH_HEAD
sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/ -t ${CI_PULL_REQUEST};
echo "using fetch head:"$HASH_TO_USE
else
HASH_TO_USE=${CI_COMMIT}
sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/;
echo "using travis commit:"$HASH_TO_USE
fi
echo "Docker build done =========="

cd $WORKING_DIR/gatk/scripts/
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json >$WORKING_DIR/vcf_site_level_filtering_travis.json
echo "JSON FILES (modified) ======="
cat $WORKING_DIR/vcf_site_level_filtering_travis.json
echo "=================="


echo "Running Filtering WDL through cromwell"
ln -fs $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
cd $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/
java -jar $CROMWELL_JAR run JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_travis.json
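Before pushing changes, the WDL itself can be syntax-checked locally with womtool, Cromwell's companion tool; a minimal sketch, assuming a womtool jar is on hand (it ships alongside Cromwell releases):

```bash
# Validate the workflow without running it.
java -jar womtool.jar validate scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
```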
13 changes: 13 additions & 0 deletions scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json
@@ -0,0 +1,13 @@
{
"JointVcfFiltering.gatk_docker": "__GATK_DOCKER__",
"JointVcfFiltering.vcf": ["/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz",
"/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz"],
"JointVcfFiltering.vcf_index": ["/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi",
"/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi"],
"JointVcfFiltering.sites_only_vcf": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz",
"JointVcfFiltering.sites_only_vcf_index": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi",
"JointVcfFiltering.basename": "test_10_samples",
"JointVcfFiltering.snp_annotations": "-A ReadPosRankSum -A FS -A SOR -A QD -A AVERAGE_TREE_SCORE -A AVERAGE_ASSEMBLED_HAPS -A AVERAGE_FILTERED_HAPS",
"JointVcfFiltering.indel_annotations": "-A MQRankSum -A ReadPosRankSum -A FS -A SOR -A QD -A AVERAGE_TREE_SCORE",
"JointVcfFiltering.model_backend": "PYTHON_IFOREST"
}
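This inputs file can be cross-checked against the workflow's declared inputs by generating a template with womtool (a sketch; the jar path is an assumption):

```bash
# Emit a JSON template listing every workflow input; compare against the file above.
java -jar womtool.jar inputs scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl > inputs_template.json
```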
273 changes: 273 additions & 0 deletions scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
@@ -0,0 +1,273 @@
version 1.0

# This is a workflow for filtering a joint callset VCF using INFO-level (i.e., site-level) annotations.
# Note that the input VCFs here may be sharded by genomic position, which may be helpful for large cohorts;
# the workflow will output the same number of shards as are input.
# This portion of the filtering pipeline assigns a SCORE INFO annotation to each site but does not yet apply
# a filtering threshold to the final VCF (see the sketch after this file for one way a threshold might be applied).

workflow JointVcfFiltering {
input {
Array[File] vcf
Array[File] vcf_index
File sites_only_vcf
File sites_only_vcf_index
String basename

String model_backend
File? training_python_script
File? scoring_python_script
File? hyperparameters_json

String gatk_docker
File? extract_interval_list
File? score_interval_list

String snp_annotations
String indel_annotations
File? gatk_override

String snp_resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
String indel_resource_args = "--resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
}

parameter_meta {
vcf: "An array of input VCFs that are one callset sharded by genomic region."
sites_only_vcf: "The full VCF callset without any genotype or sample level information."
basename: "Desired output file basename."
}

call ExtractVariantAnnotations as ExtractVariantAnnotationsSNPs {
input:
input_vcf = sites_only_vcf,
input_vcf_index = sites_only_vcf_index,
mode = "SNP",
annotations = snp_annotations,
resource_args = snp_resource_args,
basename = basename,
interval_list = extract_interval_list,
gatk_override = gatk_override,
gatk_docker = gatk_docker
}

call ExtractVariantAnnotations as ExtractVariantAnnotationsINDELs {
input:
input_vcf = sites_only_vcf,
input_vcf_index = sites_only_vcf_index,
mode = "INDEL",
annotations = indel_annotations,
resource_args = indel_resource_args,
basename = basename,
interval_list = extract_interval_list,
gatk_override = gatk_override,
gatk_docker = gatk_docker
}

call TrainVariantAnnotationModel as TrainVariantAnnotationModelSNPs {
input:
annots = ExtractVariantAnnotationsSNPs.annots,
basename = basename,
mode = "snp",
model_backend = model_backend,
python_script = training_python_script,
hyperparameters_json = hyperparameters_json,
gatk_override = gatk_override,
gatk_docker = gatk_docker
}

call TrainVariantAnnotationModel as TrainVariantAnnotationModelINDELs {
input:
annots = ExtractVariantAnnotationsINDELs.annots,
basename = basename,
mode = "indel",
model_backend = model_backend,
python_script = training_python_script,
hyperparameters_json = hyperparameters_json,
gatk_override = gatk_override,
gatk_docker = gatk_docker
}

scatter(idx in range(length(vcf))) {
call ScoreVariantAnnotations as ScoreVariantAnnotationsSNPs {
input:
vcf = vcf[idx],
vcf_index = vcf_index[idx],
basename = basename,
mode = "SNP",
model_backend = model_backend,
python_script = scoring_python_script,
annotations = snp_annotations,
extracted_training_vcf = ExtractVariantAnnotationsSNPs.extracted_training_vcf,
extracted_training_vcf_index = ExtractVariantAnnotationsSNPs.extracted_training_vcf_index,
interval_list = score_interval_list,
model_files = TrainVariantAnnotationModelSNPs.outputs,
resource_args = snp_resource_args,
gatk_override = gatk_override,
gatk_docker = gatk_docker
}

call ScoreVariantAnnotations as ScoreVariantAnnotationsINDELs {
input:
vcf = ScoreVariantAnnotationsSNPs.output_vcf,
vcf_index = ScoreVariantAnnotationsSNPs.output_vcf_index,
basename = basename,
mode = "INDEL",
model_backend = model_backend,
python_script = scoring_python_script,
annotations = indel_annotations,
extracted_training_vcf = ExtractVariantAnnotationsINDELs.extracted_training_vcf,
extracted_training_vcf_index = ExtractVariantAnnotationsINDELs.extracted_training_vcf_index,
interval_list = score_interval_list,
model_files = TrainVariantAnnotationModelINDELs.outputs,
resource_args = indel_resource_args,
gatk_override = gatk_override,
gatk_docker = gatk_docker
}
}

output {
Array[File] variant_filtered_vcf = ScoreVariantAnnotationsINDELs.output_vcf
Array[File] variant_filtered_vcf_index = ScoreVariantAnnotationsINDELs.output_vcf_index
}

}

task ExtractVariantAnnotations {
input {
String gatk_docker
File? gatk_override
File input_vcf
File input_vcf_index
String basename
String mode
String annotations
String resource_args
File? interval_list

Int memory_mb = 14000
Int command_mem = memory_mb - 1000
}
Int disk_size = ceil(size(input_vcf, "GB") + 50)
command {
set -e
export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

gatk --java-options "-Xmx~{command_mem}m" \
ExtractVariantAnnotations \
-V ~{input_vcf} \
-O ~{basename}.~{mode} \
~{annotations} \
~{"-L " + interval_list} \
--mode ~{mode} \
~{resource_args}
}
output {
File annots = "~{basename}.~{mode}.annot.hdf5"
File extracted_training_vcf = "~{basename}.~{mode}.vcf.gz"
File extracted_training_vcf_index = "~{basename}.~{mode}.vcf.gz.tbi"
Array[File] outputs = glob("~{basename}.~{mode}.*")
}
runtime {
docker: gatk_docker
disks: "local-disk " + disk_size + " LOCAL"
memory: memory_mb + " MiB"
}
}

task TrainVariantAnnotationModel {
input {
String gatk_docker
File? gatk_override
File annots
String basename
String mode
String model_backend
File? python_script
File? hyperparameters_json

Int memory_mb = 14000
Int command_mem = memory_mb - 1000
}
Int disk_size = ceil(size(annots, "GB") + 100)
command <<<
set -e

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

mode=$(echo "~{mode}" | awk '{print toupper($0)}')

gatk --java-options "-Xmx~{command_mem}m" \
TrainVariantAnnotationsModel \
--annotations-hdf5 ~{annots} \
-O ~{basename} \
--model-backend ~{model_backend} \
~{"--python-script " + python_script} \
~{"--hyperparameters-json " + hyperparameters_json} \
--mode $mode

>>>
output {
Array[File] outputs = glob("~{basename}.~{mode}.*")
}
runtime {
docker: gatk_docker
disks: "local-disk " + disk_size + " LOCAL"
memory: memory_mb + " MiB"
}
}

task ScoreVariantAnnotations {
input {
String gatk_docker
File? gatk_override
File vcf
File vcf_index
String basename
String mode
String model_backend
File? python_script
String annotations
String resource_args
File extracted_training_vcf
File extracted_training_vcf_index
File? interval_list
Array[File] model_files

Int memory_mb = 16000
Int command_mem = memory_mb - 1000
}
Int disk_size = ceil(size(vcf, "GB") * 2 + 50)

command {
set -e

ln -s ~{sep=" . && ln -s " model_files} .

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

gatk --java-options "-Xmx~{command_mem}m" \
ScoreVariantAnnotations \
~{"-L " + interval_list} \
-V ~{vcf} \
-O ~{basename}.~{mode} \
--model-backend ~{model_backend} \
~{"--python-script " + python_script} \
--model-prefix ~{basename} \
~{annotations} \
--mode ~{mode} \
--resource:extracted,extracted=true ~{extracted_training_vcf} \
~{resource_args}
}
output {
File scores = "~{basename}.~{mode}.scores.hdf5"
File annots = "~{basename}.~{mode}.annot.hdf5"
File output_vcf = "~{basename}.~{mode}.vcf.gz"
File output_vcf_index = "~{basename}.~{mode}.vcf.gz.tbi"
}
runtime {
docker: gatk_docker
disks: "local-disk " + disk_size + " LOCAL"
memory: memory_mb + " MiB"
}
}
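As the workflow's header comment notes, a SCORE INFO annotation is assigned but no filtering threshold is applied. A hypothetical downstream hard-filtering step using GATK's VariantFiltration might look like the sketch below; the cutoff of -1.0 is an illustrative assumption, not a recommended default:

```bash
# Sketch: mark sites whose SCORE falls below an assumed cutoff as filtered.
gatk VariantFiltration \
    -V test_10_samples.INDEL.vcf.gz \
    -O test_10_samples.score_filtered.vcf.gz \
    --filter-expression "SCORE < -1.0" \
    --filter-name "LowScore"
```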

(6 Git LFS test data files not shown)