Skip to content

Commit

Permalink
Enabled GVCF type filtering support in SelectVariants (#7193)
Browse files Browse the repository at this point in the history
- Added optional argument `--ignore-non-ref-in-types` to support correct handling of VariantContexts that contain a NON_REF allele
- Default behavior does not change
- Note that this only enables correct handling of GVCF input. The filtered output files are VCF (not GVCF) files, since reference blocks are not extended when a variant is filtered out
- Added integration test
  • Loading branch information
michaelgatzen authored Feb 15, 2023
1 parent a88ac4e commit 692e7a3
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,21 @@ public final class VariantTypesVariantFilter implements VariantFilter {
private static final long serialVersionUID = 1L;

private final Set<VariantContext.Type> sampleTypes;
private final boolean ignoreNonRef;

public VariantTypesVariantFilter(Set<VariantContext.Type> includeTypes) {
this(includeTypes, false);
}

public VariantTypesVariantFilter(Set<VariantContext.Type> includeTypes, final boolean ignoreNonRefAlleles) {
Utils.nonNull(includeTypes);
sampleTypes = includeTypes;
ignoreNonRef = ignoreNonRefAlleles;
}

@Override
public boolean test(final VariantContext vc) {
final VariantContext.Type vcSampleType = vc.getType();
final VariantContext.Type vcSampleType = vc.getType(ignoreNonRef);
return sampleTypes.contains(vcSampleType);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@ public final class SelectVariants extends VariantWalker {
private static final int MAX_NOCALL_NUMBER_DEFAULT_VALUE = Integer.MAX_VALUE;
private static final double MAX_NOCALL_FRACTION_DEFAULT_VALUE = 1.0;

private static final List<String> GVCF_EXTENSIONS = Arrays.asList(".g.vcf", ".g.vcf.gz", ".gvcf", ".gvcf.gz");

/**
* A site is considered discordant if there exists some sample in the variant track that has a non-reference
* genotype and either the site isn't present in this track, the sample isn't present in this track, or the
Expand Down Expand Up @@ -388,6 +390,16 @@ public final class SelectVariants extends VariantWalker {
doc="Do not select certain type of variants from the input file", optional=true)
private List<VariantContext.Type> typesToExclude = new ArrayList<>();

/**
* When this argument is set, NON_REF alleles will not be considered for the variant type determination. This is
* necessary because every variant in a GVCF file would otherwise be assigned the type MIXED, which makes it
* impossible to filter for e.g. SNPs. If only NON_REF alleles are present at a given site it will still be
* considered SYMBOLIC.
*/
@Argument(fullName="ignore-non-ref-in-types",
doc="If set, NON_REF alleles will be ignored for variant type determination, which is required for filtering GVCF files by type", optional=true)
private boolean ignoreNonRefInTypes = false;

/**
* List of IDs (or a .list file containing ids) to select. The tool will only select variants whose ID
* field is present in this list of IDs. The matching is done by exact string matching. If a file, the file
Expand Down Expand Up @@ -635,6 +647,11 @@ public void onTraversalStart() {
}
}

if (!ignoreNonRefInTypes && (!typesToInclude.isEmpty() || !typesToExclude.isEmpty()) &&
GVCF_EXTENSIONS.stream().anyMatch(extension -> getDrivingVariantsFeatureInput().hasExtension(extension))) {
logger.warn("Filtering by variant type and GVCF input detected, but --ignore-non-ref-in-types argument is not set. Variant types will likely not be filtered correctly. Consider setting this argument for meaningful results.");
}

final Path outPath = vcfOutput.toPath();
vcfWriter = createVCFWriter(outPath);
vcfWriter.writeHeader(new VCFHeader(actualHeaderLines, samples));
Expand Down Expand Up @@ -877,7 +894,7 @@ public void closeTool() {
protected CountingVariantFilter makeVariantFilter() {
CountingVariantFilter compositeFilter = new CountingVariantFilter(VariantFilterLibrary.ALLOW_ALL_VARIANTS);
if (!selectedTypes.isEmpty()) {
compositeFilter = compositeFilter.and(new CountingVariantFilter(new VariantTypesVariantFilter(selectedTypes)));
compositeFilter = compositeFilter.and(new CountingVariantFilter(new VariantTypesVariantFilter(selectedTypes, ignoreNonRefInTypes)));
}

if (rsIDsToKeep != null && !rsIDsToKeep.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,21 @@ public void testVariantTypeSelection() throws IOException {
spec.executeTest("testVariantTypeSelection--" + testFile, this);
}

/**
* Test including variant types in GVCF files.
*/
@Test
public void testVariantTypeSelectionForGVCF() throws IOException {
final String testFile = getToolTestDataDir() + "gvcfExample.g.vcf";

final IntegrationTestSpec spec = new IntegrationTestSpec(
baseTestString(" --select-type-to-include SNP --ignore-non-ref-in-types ",testFile),
Collections.singletonList(getToolTestDataDir() + "expected/" + "testSelectVariants_VariantTypeSelectionForGVCF.vcf")
);

spec.executeTest("testVariantTypeSelectionForGVCF--" + testFile, this);
}

/**
* Test excluding indels that are larger than the specified size
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
##fileformat=VCFv4.2
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BLOCK_SIZE,Number=1,Type=Integer,Description="Size of the homozygous reference GVCF block">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##contig=<ID=chr20,length=64444167>
##source=SelectVariants
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1
chr20 69511 . A G,<NON_REF> 2253.77 . BaseQRankSum=1.169;DP=82;MQ=31.05;MQ0=0;MQRankSum=-0.866;ReadPosRankSum=1.689 GT:AD:DP:GQ:PL:SB 1/1:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33
chr20 69635 . A T,<NON_REF> 60.77 . BaseQRankSum=0.937;DP=7;MQ=34.15;MQ0=0;MQRankSum=1.300;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:10:10,0,119,101,128,229:0,4,0,3
chr20 69785 . T G,<NON_REF> 2253.77 . BaseQRankSum=1.169;DP=82;MQ=31.05;MQ0=0;MQRankSum=-0.866;ReadPosRankSum=1.689 GT:DP:GQ:PL:SB 1/1:80:99:2284,207,0,2287,237,2316:0,1,46,33
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
##fileformat=VCFv4.1
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
##INFO=<ID=BLOCK_SIZE,Number=1,Type=Integer,Description="Size of the homozygous reference GVCF block">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##contig=<ID=chr20,length=64444167,assembly=Homo_sapiens_assembly38.fasta>
##source=SelectVariants
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1
chr20 69485 . G <NON_REF> . . BLOCK_SIZE=20;END=69510 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:94:99:82:99:0,120,1800
chr20 69511 . A G,<NON_REF> 2253.77 . BaseQRankSum=1.169;DP=82;MQ=31.05;MQ0=0;MQRankSum=-0.866;ReadPosRankSum=1.689 GT:AD:DP:GQ:PL:SB 1/1:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33
chr20 69512 . A <NON_REF> . . BLOCK_SIZE=10;END=69521 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:96:99:82:99:0,120,1800
chr20 69522 . A <NON_REF> . . BLOCK_SIZE=27;END=69548 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:95:0:95:0:0,0,0
chr20 69549 . T <NON_REF> . . BLOCK_SIZE=86;END=69634 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:6:16:5:16:0,16,90
chr20 69635 . A T,<NON_REF> 60.77 . BaseQRankSum=0.937;DP=7;MQ=34.15;MQ0=0;MQRankSum=1.300;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:10:10,0,119,101,128,229:0,4,0,3
chr20 69762 . A <NON_REF> . . BLOCK_SIZE=1;END=69762 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:18:7:18:0,18,270
chr20 69763 . A <NON_REF> . . BLOCK_SIZE=4;END=69766 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:21:7:21:0,21,253
chr20 69767 . A <NON_REF> . . BLOCK_SIZE=4;END=69770 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:12:7:12:0,12,180
chr20 69771 . TAAAAA T,<NON_REF> 60.77 . BaseQRankSum=0.937;DP=7;MQ=34.15;MQ0=0;MQRankSum=1.300;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:0,0,0:0:20:20,0,119,101,128,229:0,4,0,3
chr20 69777 . G <NON_REF> . . BLOCK_SIZE=10;END=69783 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:0:3:0:0,0,0
chr20 69785 . T G,<NON_REF> 2253.77 . BaseQRankSum=1.169;DP=82;MQ=31.05;MQ0=0;MQRankSum=-0.866;ReadPosRankSum=1.689 GT:DP:GQ:PL:SB 1/1:80:99:2284,207,0,2287,237,2316:0,1,46,33

0 comments on commit 692e7a3

Please sign in to comment.