Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not split multiallelic variants #640

Merged
merged 23 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
c86854f
Do not split multiallelics
bartcharbon Sep 9, 2024
550b245
WIP
bartcharbon Oct 16, 2024
15f37c6
Merge branch 'main' of https://github.com/molgenis/vip into feat/mult…
bartcharbon Oct 18, 2024
2196779
WIP multiallelic support
bartcharbon Oct 18, 2024
3267807
Update report and inheritance matcher images
bartcharbon Oct 25, 2024
82f42ee
update md5sums
bartcharbon Oct 25, 2024
1b5e3a1
Fix config
bartcharbon Oct 25, 2024
83d1b90
Update modules/vcf/templates/inheritance.sh
bartcharbon Nov 4, 2024
f92a899
Update modules/vcf/templates/classify.sh
bartcharbon Nov 4, 2024
47cdf2a
Update modules/vcf/templates/classify.sh
bartcharbon Nov 4, 2024
6fa5b15
Update modules/vcf/templates/classify_samples.sh
bartcharbon Nov 4, 2024
7336105
Update modules/vcf/templates/classify.sh
bartcharbon Nov 4, 2024
9dd4ee1
Update modules/vcf/templates/classify.sh
bartcharbon Nov 4, 2024
3209abc
Update modules/vcf/templates/classify_samples.sh
bartcharbon Nov 4, 2024
6cb591c
Update modules/vcf/templates/classify_samples.sh
bartcharbon Nov 4, 2024
299c394
Update modules/vcf/templates/inheritance.sh
bartcharbon Nov 4, 2024
b8e0fa3
Update modules/vcf/templates/classify_samples.sh
bartcharbon Nov 4, 2024
b9ec972
Update modules/vcf/templates/inheritance.sh
bartcharbon Nov 4, 2024
0a1891e
Update modules/vcf/templates/inheritance.sh
bartcharbon Nov 4, 2024
d8d6b9f
update docs
bartcharbon Nov 4, 2024
fcc3bec
update test resource with multiallelic
bartcharbon Nov 4, 2024
fd95ed6
Merge branch 'feat/multiallelic' of https://github.com/molgenis/vip i…
bartcharbon Nov 4, 2024
bae622c
update corner cases test
bartcharbon Nov 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions config/nxf_vcf.config
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ env {
CMD_VEP = "apptainer exec --no-mount home --bind \${TMPDIR} ${APPTAINER_CACHEDIR}/vep-111.0.sif vep"
CMD_FILTERVEP = "apptainer exec --no-mount home --bind \${TMPDIR} ${APPTAINER_CACHEDIR}/vep-111.0.sif filter_vep"
CMD_STRANGER = "apptainer exec --no-mount home --bind \${TMPDIR} ${APPTAINER_CACHEDIR}/stranger-0.8.1_v2.sif stranger"
CMD_VCFREPORT="apptainer exec --no-mount home --bind \${TMPDIR} ${APPTAINER_CACHEDIR}/vcf-report-7.0.2.sif"
CMD_VCFREPORT="apptainer exec --no-mount home --bind \${TMPDIR} ${APPTAINER_CACHEDIR}/vcf-report-7.0.3.sif"
CMD_VCFDECISIONTREE = "apptainer exec --no-mount home --bind \${TMPDIR} ${APPTAINER_CACHEDIR}/vcf-decision-tree-4.1.4.sif"
CMD_VCFINHERITANCEMATCHER = "apptainer exec --no-mount home --bind \${TMPDIR} ${APPTAINER_CACHEDIR}/vcf-inheritance-matcher-3.2.0.sif"
CMD_VCFINHERITANCEMATCHER = "apptainer exec --no-mount home --bind \${TMPDIR} ${APPTAINER_CACHEDIR}/vcf-inheritance-matcher-3.2.1.sif"

// workaround for SAMtools https://github.com/samtools/samtools/issues/1366#issuecomment-769170935
REF_PATH = ":"
Expand Down Expand Up @@ -37,7 +37,7 @@ process {
}

withLabel: 'vcf_report' {
memory = '4GB'
memory = '6GB'
}
}

Expand Down
1 change: 1 addition & 0 deletions docs/home/key_features.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Features include:
- Long-read sequencing support (Oxford Nanopore, PacBio HiFi)
- Short-read sequencing support (Illumina, both single and paired-end reads)
- Supports GRCh38, supports GRCh37 and T2T via liftover
- Supports multiallelic variants
- Short variant detection
- Structural variant detection
- Short tandem repeat detection
Expand Down
4 changes: 2 additions & 2 deletions install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ download_files() {
urls+=("8f6e06847776448e004df8b863571109" "images/straglr-1.4.4_vip_v3.sif")
urls+=("9c69ac645e04b91c8f480289c536429c" "images/stranger-0.8.1_v2.sif")
urls+=("3855ced7eb0c72f283d098d7938d8586" "images/vcf-decision-tree-4.1.4.sif")
urls+=("d2ab19ecccd9c596e2ec07442d1d551d" "images/vcf-inheritance-matcher-3.2.0.sif")
urls+=("14c1904c601cf051ce4c9dc4b0aa4607" "images/vcf-report-7.0.2.sif")
urls+=("f238b75e85e8a097447bad471369d0b2" "images/vcf-inheritance-matcher-3.2.1.sif")
urls+=("87b2d9031b1b8351d2da14dd0095fbea" "images/vcf-report-7.0.3.sif")
urls+=("7bffc236a7c65b2b2e2e5f7d64beaa87" "images/vep-111.0.sif")
urls+=("82be3c18406e7c027ee4cec83a723d71" "nextflow-24.04.2-all")
if [ "${assembly}" == "ALL" ] || [ "${assembly}" == "GRCh37" ]; then
Expand Down
25 changes: 21 additions & 4 deletions modules/vcf/templates/classify.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@ classify () {
args+=("-XX:ParallelGCThreads=2")
args+=("-Xmx!{task.memory.toMega() - 512}m")
args+=("-jar" "/opt/vcf-decision-tree/lib/vcf-decision-tree.jar")
args+=("--input" "!{vcf}")
args+=("--input" "!{vcf}_replaced.vcf.gz")
args+=("--metadata" "!{metadata}")
args+=("--config" "!{decisionTree}")
if [ !{annotatePath} -eq 1 ]; then
args+=("--path")
fi

args+=("--output" "!{vcfOut}")
args+=("--output" "!{vcfOut}_replaced.vcf.gz")

${CMD_VCFDECISIONTREE} java "${args[@]}"
}
Expand All @@ -25,7 +25,7 @@ index () {
}


#Workaround for https://github.com/samtools/htsjdk/issues/500https://github.com/samtools/htsjdk/issues/500
#Workaround for https://github.com/samtools/htsjdk/issues/500
store_alt(){
#store ALT headers
zcat "!{vcf}" | sed --quiet --expression='/^##ALT/p' > header.tmp
Expand All @@ -38,15 +38,32 @@ insert_alt(){
#re-insert the ALT headers
f1=$(<header.tmp)
awk -vf1="$f1" '/^#CHROM/{print f1;print;next}1' "!{vcfOut}".tmp | ${CMD_BGZIP} -c > "!{vcfOut}"
#rm header.tmp
fi
}

#Workaround for https://github.com/samtools/htsjdk/issues/1718
# Give every <CNV:TR> ALT allele of a record its own numbered name
# (<CNV:TR1>, <CNV:TR2>, ...) before the VCF is handed to the decision tree.
# Reads the compressed input VCF and writes a compressed copy next to it with
# the suffix "_replaced.vcf.gz"; restore_cnv_tr undoes the renaming afterwards.
replace_cnv_tr(){
  # Only data records are rewritten: '#' header lines pass through verbatim,
  # whatever the awk implementation does when a missing 5th field is targeted.
  # The counter restarts for each record; sub() renames the left-most remaining
  # <CNV:TR> in the ALT column ($5) on every pass until none is left.
  # CMD_BGZIP is deliberately unquoted: presumably, like the other CMD_*
  # settings, it holds a multi-word command line (verify against the config).
  zcat "!{vcf}" |
    awk 'BEGIN { FS = OFS = "\t" }
         !/^#/ { n = 0; while (sub(/<CNV:TR>/, "<CNV:TR" (++n) ">", $5)); }
         { print }' |
    ${CMD_BGZIP} -c > "!{vcf}_replaced.vcf.gz"
}

# Undo the numbering applied by replace_cnv_tr: every <CNV:TRn> (n = one or
# more digits) in the ALT column of the decision tree output becomes <CNV:TR>
# again. Reads the tool output carrying the "_replaced.vcf.gz" suffix and
# writes the final compressed output VCF.
restore_cnv_tr(){
  # Header lines ('#') are passed through untouched; only the ALT column ($5)
  # of data records is rewritten.
  # CMD_BGZIP is deliberately unquoted: presumably, like the other CMD_*
  # settings, it holds a multi-word command line (verify against the config).
  zcat "!{vcfOut}_replaced.vcf.gz" |
    awk 'BEGIN { FS = OFS = "\t" }
         !/^#/ { gsub(/<CNV:TR[0-9]+>/, "<CNV:TR>", $5) }
         { print }' |
    ${CMD_BGZIP} -c > "!{vcfOut}"
}

# Remove the intermediate files created by this script: the renamed copy of
# the input, the renamed tool output and the stored ALT header lines.
cleanup(){
  # One idempotent call: -f keeps an already-missing file from failing the
  # task after the real output has been produced, and a problem with one file
  # no longer prevents the others from being removed. '--' ends option parsing
  # so a file name starting with '-' is not mistaken for an option.
  rm -f -- "!{vcf}_replaced.vcf.gz" "!{vcfOut}_replaced.vcf.gz" header.tmp
}

# Entry point: run the processing stages one after another, in exactly the
# order listed below. The exit status is that of the last stage executed.
main () {
  local -a stages=(
    store_alt        # stash the ##ALT header lines of the input
    replace_cnv_tr   # number the <CNV:TR> alleles
    classify         # run the decision tree
    restore_cnv_tr   # turn the numbered alleles back into <CNV:TR>
    insert_alt       # re-insert the stashed ##ALT header lines
    index            # index the output
    cleanup          # drop intermediate files
  )
  local stage
  for stage in "${stages[@]}"; do
    "${stage}"
  done
}

main "$@"
25 changes: 21 additions & 4 deletions modules/vcf/templates/classify_samples.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ classify_samples() {
args+=("-XX:ParallelGCThreads=2")
args+=("-Xmx!{task.memory.toMega() - 512}m")
args+=("-jar" "/opt/vcf-decision-tree/lib/vcf-decision-tree.jar")
args+=("--input" "!{vcf}")
args+=("--input" "!{vcf}_replaced.vcf.gz")
args+=("--metadata" "!{metadata}")
args+=("--type" "sample")
args+=("--config" "!{decisionTree}")
Expand All @@ -17,7 +17,7 @@ classify_samples() {
if [ -n "!{probands}" ]; then
args+=("--probands" "!{probands}")
fi
args+=("--output" "!{vcfOut}")
args+=("--output" "!{vcfOut}_replaced.vcf.gz")

${CMD_VCFDECISIONTREE} java "${args[@]}"
}
Expand All @@ -28,7 +28,7 @@ index () {
}


#Workaround for https://github.com/samtools/htsjdk/issues/500https://github.com/samtools/htsjdk/issues/500
#Workaround for https://github.com/samtools/htsjdk/issues/500
store_alt(){
#store ALT headers
zcat "!{vcf}" | sed --quiet --expression='/^##ALT/p' > header.tmp
Expand All @@ -41,15 +41,32 @@ insert_alt(){
#re-insert the ALT headers
f1=$(<header.tmp)
awk -vf1="$f1" '/^#CHROM/{print f1;print;next}1' "!{vcfOut}".tmp | ${CMD_BGZIP} -c > "!{vcfOut}"
#rm header.tmp
fi
}

#Workaround for https://github.com/samtools/htsjdk/issues/1718
# Give every <CNV:TR> ALT allele of a record its own numbered name
# (<CNV:TR1>, <CNV:TR2>, ...) before the VCF is handed to the decision tree.
# Reads the compressed input VCF and writes a compressed copy next to it with
# the suffix "_replaced.vcf.gz"; restore_cnv_tr undoes the renaming afterwards.
replace_cnv_tr(){
  # Only data records are rewritten: '#' header lines pass through verbatim,
  # whatever the awk implementation does when a missing 5th field is targeted.
  # The counter restarts for each record; sub() renames the left-most remaining
  # <CNV:TR> in the ALT column ($5) on every pass until none is left.
  # CMD_BGZIP is deliberately unquoted: presumably, like the other CMD_*
  # settings, it holds a multi-word command line (verify against the config).
  zcat "!{vcf}" |
    awk 'BEGIN { FS = OFS = "\t" }
         !/^#/ { n = 0; while (sub(/<CNV:TR>/, "<CNV:TR" (++n) ">", $5)); }
         { print }' |
    ${CMD_BGZIP} -c > "!{vcf}_replaced.vcf.gz"
}

# Undo the numbering applied by replace_cnv_tr: every <CNV:TRn> (n = one or
# more digits) in the ALT column of the decision tree output becomes <CNV:TR>
# again. Reads the tool output carrying the "_replaced.vcf.gz" suffix and
# writes the final compressed output VCF.
restore_cnv_tr(){
  # Header lines ('#') are passed through untouched; only the ALT column ($5)
  # of data records is rewritten.
  # CMD_BGZIP is deliberately unquoted: presumably, like the other CMD_*
  # settings, it holds a multi-word command line (verify against the config).
  zcat "!{vcfOut}_replaced.vcf.gz" |
    awk 'BEGIN { FS = OFS = "\t" }
         !/^#/ { gsub(/<CNV:TR[0-9]+>/, "<CNV:TR>", $5) }
         { print }' |
    ${CMD_BGZIP} -c > "!{vcfOut}"
}

# Remove the intermediate files created by this script: the renamed copy of
# the input, the renamed tool output and the stored ALT header lines.
cleanup(){
  # One idempotent call: -f keeps an already-missing file from failing the
  # task after the real output has been produced, and a problem with one file
  # no longer prevents the others from being removed. '--' ends option parsing
  # so a file name starting with '-' is not mistaken for an option.
  rm -f -- "!{vcf}_replaced.vcf.gz" "!{vcfOut}_replaced.vcf.gz" header.tmp
}

# Entry point: run the processing stages one after another, in exactly the
# order listed below. The exit status is that of the last stage executed.
main() {
  local -a stages=(
    replace_cnv_tr     # number the <CNV:TR> alleles
    store_alt          # stash the ##ALT header lines of the input
    classify_samples   # run the sample-level decision tree
    restore_cnv_tr     # turn the numbered alleles back into <CNV:TR>
    insert_alt         # re-insert the stashed ##ALT header lines
    index              # index the output
    cleanup            # drop intermediate files
  )
  local stage
  for stage in "${stages[@]}"; do
    "${stage}"
  done
}

main "$@"
25 changes: 21 additions & 4 deletions modules/vcf/templates/inheritance.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ inheritance () {
args+=("-XX:ParallelGCThreads=2")
args+=("-Xmx!{task.memory.toMega() - 512}m")
args+=("-jar" "/opt/vcf-inheritance-matcher/lib/vcf-inheritance-matcher.jar")
args+=("--input" "!{vcf}")
args+=("--output" "!{vcfOut}")
args+=("--input" "!{vcf}_replaced.vcf.gz")
args+=("--output" "!{vcfOut}_replaced.vcf.gz")
if [ -n "!{pedigree}" ]; then
args+=("--pedigree" "!{pedigree}")
fi
Expand All @@ -28,7 +28,7 @@ index () {
${CMD_BCFTOOLS} index --stats "!{vcfOut}" > "!{vcfOutStats}"
}

#Workaround for https://github.com/samtools/htsjdk/issues/500https://github.com/samtools/htsjdk/issues/500
#Workaround for https://github.com/samtools/htsjdk/issues/500
store_alt(){
#store ALT headers
zcat "!{vcf}" | sed --quiet --expression='/^##ALT/p' > header.tmp
Expand All @@ -41,16 +41,33 @@ insert_alt(){
#re-insert the ALT headers
f1=$(<header.tmp)
awk -vf1="$f1" '/^#CHROM/{print f1;print;next}1' "!{vcfOut}".tmp | ${CMD_BGZIP} -c > "!{vcfOut}"
#rm header.tmp
fi
}

#Workaround for https://github.com/samtools/htsjdk/issues/1718
# Give every <CNV:TR> ALT allele of a record its own numbered name
# (<CNV:TR1>, <CNV:TR2>, ...) before the VCF is handed to the inheritance
# matcher. Reads the compressed input VCF and writes a compressed copy next to
# it with the suffix "_replaced.vcf.gz"; restore_cnv_tr undoes the renaming.
replace_cnv_tr(){
  # Only data records are rewritten: '#' header lines pass through verbatim,
  # whatever the awk implementation does when a missing 5th field is targeted.
  # The counter restarts for each record; sub() renames the left-most remaining
  # <CNV:TR> in the ALT column ($5) on every pass until none is left.
  # CMD_BGZIP is deliberately unquoted: presumably, like the other CMD_*
  # settings, it holds a multi-word command line (verify against the config).
  zcat "!{vcf}" |
    awk 'BEGIN { FS = OFS = "\t" }
         !/^#/ { n = 0; while (sub(/<CNV:TR>/, "<CNV:TR" (++n) ">", $5)); }
         { print }' |
    ${CMD_BGZIP} -c > "!{vcf}_replaced.vcf.gz"
}

# Undo the numbering applied by replace_cnv_tr: every <CNV:TRn> (n = one or
# more digits) in the ALT column of the inheritance matcher output becomes
# <CNV:TR> again. Reads the tool output carrying the "_replaced.vcf.gz" suffix
# and writes the final compressed output VCF.
restore_cnv_tr(){
  # Header lines ('#') are passed through untouched; only the ALT column ($5)
  # of data records is rewritten.
  # CMD_BGZIP is deliberately unquoted: presumably, like the other CMD_*
  # settings, it holds a multi-word command line (verify against the config).
  zcat "!{vcfOut}_replaced.vcf.gz" |
    awk 'BEGIN { FS = OFS = "\t" }
         !/^#/ { gsub(/<CNV:TR[0-9]+>/, "<CNV:TR>", $5) }
         { print }' |
    ${CMD_BGZIP} -c > "!{vcfOut}"
}

# Remove the intermediate files created by this script: the renamed copy of
# the input, the renamed tool output and the stored ALT header lines.
cleanup(){
  # One idempotent call: -f keeps an already-missing file from failing the
  # task after the real output has been produced, and a problem with one file
  # no longer prevents the others from being removed. '--' ends option parsing
  # so a file name starting with '-' is not mistaken for an option.
  rm -f -- "!{vcf}_replaced.vcf.gz" "!{vcfOut}_replaced.vcf.gz" header.tmp
}

# Entry point: run the processing stages one after another, in exactly the
# order listed below. The exit status is that of the last stage executed.
main() {
  local -a stages=(
    replace_cnv_tr   # number the <CNV:TR> alleles
    create_ped       # defined elsewhere in this script
    store_alt        # stash the ##ALT header lines of the input
    inheritance      # run the inheritance matcher
    restore_cnv_tr   # turn the numbered alleles back into <CNV:TR>
    insert_alt       # re-insert the stashed ##ALT header lines
    index            # index the output
    cleanup          # drop intermediate files
  )
  local stage
  for stage in "${stages[@]}"; do
    "${stage}"
  done
}

main "$@"
3 changes: 1 addition & 2 deletions modules/vcf/templates/normalize.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@ set -euo pipefail
normalize () {
local args=()
args+=("norm")
# split multi-allelic sites into bi-allelic records (both SNPs and indels are merged separately into two records)
args+=("--multiallelics" "-both")
# warn when incorrect or missing REF allele is encountered or when alternate allele is non-ACGTN (e.g. structural variant)
args+=("--check-ref" "w")
args+=("--fasta-ref" "!{refSeqPath}")
args+=("--output-type" "z")
args+=("--output" "!{vcfOut}")
args+=("--no-version")
args+=("--old-rec-tag" "OLD_REC") # if variant is normalized, keep the original location in this field
args+=("--threads" "!{task.cpus}")
args+=("!{vcf}")

Expand Down
4 changes: 2 additions & 2 deletions test/suites/cram/resources/single.cfg
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
params {
vcf.filter.classes = "LB,VUS,LP,P"
vcf.filter_samples.classes = "U1,U2"
vcf.filter.classes = "B,LB,VUS,LP,P"
vcf.filter_samples.classes = "U1,U2,U3"
}
12 changes: 6 additions & 6 deletions test/suites/vcf/resources/corner_cases.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@
##FILTER=<ID=q40,Description="Quality below 40">
##FILTER=<ID=c5,Description="Coverage below 5">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE0
chr1 1701 . T C 93 PASS DP=131 GT 0/1
chr1 1821 . G T 93 PASS DP=133 GT 0/1
chr1 1977 . A G 93 PASS DP=134 GT 0/1
chr1 4013 . T C 93 PASS DP=122 GT 0/1
chr1 1701 . T <CNV:TR>,<CNV:TR> 93 PASS DP=131 GT 1/2
chr1 1821 . G <CNV:TR> 93 PASS DP=133 GT 0/1
chr1 1977 . A G,C 93 PASS DP=134 GT 0/1
chr1 4013 . T C,G 93 PASS DP=122 GT 0/1
chr1 7362 . G C 93 PASS DP=111 GT 0/1
chr1 1455576 . G GGCCGAGGCCCGGGCGCGC 99 PASS . GT 1|1
chr2 321682 . T <DEL> 6 PASS SVTYPE=DEL;END=321887;SVLEN=-205 GT 0/1
chr2 321682 . T A,<DEL> 6 PASS SVTYPE=DEL;END=321887;SVLEN=-205 GT 0/1
chr2 6693415 . T <DEL> 6 PASS SVTYPE=DEL;END=6707844;SVLEN=-14429 GT 0/1
chr2 9959221 . C <DEL> 6 PASS SVTYPE=DEL;END=9959571;SVLEN=-350 GT 0/1
chr2 28983075 . C <DUP:TANDEM> . . END=105279483;SVTYPE=DUP;SVLEN=76296408 GT 1/1
Expand All @@ -36,4 +36,4 @@ chr13 51551934 . C <DEL> 6 PASS SVTYPE=DEL;END=51552263;SVLEN=-329 GT 0/1
chr17 14093499 . T <DEL> PASS . SVTYPE=DEL;END=15486000;SVLEN=1392501 GT 0/1
chrX 53513441 . G <DEL> 6 PASS SVTYPE=DEL;END=53520776;SVLEN=-7335 GT 0/1
chrX 53820721 . T <DUP> 6 PASS SVTYPE=DUP;END=53862221;SVLEN=41500 GT 0/1
chr11 119076999 . C <STR12> . PASS END=119077032;REF=11;RL=33;RU=CGG;REPID=CBL;OLD_MULTIALLELIC=11:119076999:C/<STR12>/<STR20> GT 1/0
chr11 119076999 . C <STR12>,<STR14> . PASS END=119077032;REF=11;RL=33;RU=CGG;REPID=CBL;OLD_MULTIALLELIC=11:119076999:C/<CNV:TR>/<CNV:TR> GT 1/2
Loading