VS-1416: Modify ingest to correctly handle ploidy differences in DRAGEN 3.7.8 samples #8994

Open · wants to merge 15 commits into base: ah_var_store
Changes from all commits
2 changes: 2 additions & 0 deletions .dockstore.yml
@@ -259,6 +259,7 @@ workflows:
branches:
- master
- ah_var_store
- VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples
tags:
- /.*/
- name: GvsBeta
@@ -316,6 +317,7 @@ workflows:
- master
- ah_var_store
- vs_1456_status_writes_bug
- VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples
tags:
- /.*/
- name: GvsIngestTieout
15 changes: 15 additions & 0 deletions scripts/variantstore/wdl/GvsAssignIds.wdl
@@ -28,6 +28,7 @@ workflow GvsAssignIds {
String vcf_header_lines_schema_json = '[{"name":"vcf_header_lines_hash","type":"STRING","mode":"REQUIRED"}, {"name":"vcf_header_lines","type":"STRING","mode":"REQUIRED"},{"name":"is_expected_unique","type":"BOOLEAN","mode":"REQUIRED"}]'
String sample_vcf_header_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"}, {"name":"vcf_header_lines_hash","type":"STRING","mode":"REQUIRED"}]'
String sample_load_status_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"},{"name":"status","type":"STRING","mode":"REQUIRED"}, {"name":"event_timestamp","type":"TIMESTAMP","mode":"REQUIRED"}]'
String sample_chromosome_ploidy_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"},{"name": "chromosome","type": "INTEGER","mode": "REQUIRED"},{"name": "ploidy","type": "INTEGER","mode": "REQUIRED"}]'

if (!defined(git_hash) || !defined(cloud_sdk_docker)) {
call Utils.GetToolVersions {
@@ -71,6 +72,20 @@ workflow GvsAssignIds {
cloud_sdk_docker = effective_cloud_sdk_docker,
}

call GvsCreateTables.CreateTables as CreateSamplePloidyMapTable {
input:
project_id = project_id,
dataset_name = dataset_name,
go = ValidateSamples.done,
datatype = "sample_chromosome_ploidy",
schema_json = sample_chromosome_ploidy_schema_json,
max_table_id = 1,
superpartitioned = "false",
Collaborator suggested change: superpartitioned = "false", → superpartitioned = false,
partitioned = "false",
Collaborator suggested change: partitioned = "false", → partitioned = false,
clustering_field = "sample_id",
cloud_sdk_docker = effective_cloud_sdk_docker,
}

if (load_vcf_headers) {
call GvsCreateTables.CreateTables as CreateScratchVCFHeaderLinesTable {
input:
2 changes: 2 additions & 0 deletions scripts/variantstore/wdl/GvsJointVariantCalling.wdl
@@ -90,6 +90,7 @@ workflow GvsJointVariantCalling {
String destination_project = project_id
String destination_dataset = dataset_name
String fq_temp_table_dataset = "~{destination_project}.~{destination_dataset}"
String ploidy_table_name = "sample_chromosome_ploidy"

if (false) {
Int extract_maxretries_override = ""
@@ -233,6 +234,7 @@
is_wgs = is_wgs,
maximum_alternate_alleles = maximum_alternate_alleles,
target_interval_list = target_interval_list,
ploidy_table_name = ploidy_table_name,
}

output {
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
@@ -74,7 +74,7 @@ task GetToolVersions {
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim"
String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-10-17-alpine-6dd528be8d8d"
String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024_10_10-gatkbase-1cd1f9652cb9"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-10-24-gatkbase-b29b46ab0443"
String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623"
String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e"
@@ -0,0 +1,59 @@
package org.broadinstitute.hellbender.tools.gvs.common;

import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Lazy;
import htsjdk.variant.variantcontext.VariantContext;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

/**
* Holds static methods for determining whether a particular location falls within a PAR, for the purposes of accurately determining the correct ploidy
* <p>
* Some methods, due to their implementation, make implicit hg38 assumptions. If we choose to support other references, we'll need to update this class.
*/
public class PloidyUtils {
/**
* This copies the PAR definition used in the tool FindMendelianViolations with the addition of the Y regions
* @see picard.vcf.MendelianViolations.FindMendelianViolations
https://github.com/broadinstitute/picard/blob/63138fc8b7ae33eb0490a8ef707b61befa2f51d4/src/main/java/picard/vcf/MendelianViolations/FindMendelianViolations.java#L203
* Wikipedia data: https://en.wikipedia.org/wiki/Pseudoautosomal_region
*/
private static final Set<String> PSEUDO_AUTOSOMAL_REGIONS_DEFINITION = CollectionUtil.makeSet("X:60001-2699520", "X:154931044-155260560", "chrX:10001-2781479", "chrX:155701383-156030895", "Y:10001-2649520", "Y:59034050-59363566", "chrY:10001-2781479", "chrY:56887903-57217415");

public static Lazy<Set<Interval>> ParIntervals = new Lazy<>(() -> Collections.unmodifiableSet(parseIntervalLists(PSEUDO_AUTOSOMAL_REGIONS_DEFINITION)));

public static boolean doesVariantOverlapPAR(VariantContext variant) {
return doesIntervalOverlapPAR(variant.getContig(), variant.getStart(), variant.getEnd());
}

public static boolean isLocationInPAR(long location) {
// Note: SchemaUtils.decodeContig makes implicit hg38 assumptions. If we support different references, we'll
// need to factor that into how we do our mapping from pure location numbers to contigs and positions
return doesIntervalOverlapPAR(SchemaUtils.decodeContig(location), SchemaUtils.decodePosition(location), SchemaUtils.decodePosition(location) + 1);
}

public static boolean doesIntervalOverlapPAR(String chrom, int positionStart, int positionEnd) {
Interval locationAsInterval = new Interval(chrom, positionStart, positionEnd);
for (Interval par : ParIntervals.get()) {
if (par.intersects(locationAsInterval)) {
return true;
}
}
return false;
}

private static Set<Interval> parseIntervalLists(final Set<String> intervalLists){
// Parse the PARs
final Set<Interval> intervals = new HashSet<>(intervalLists.size());
for (final String par : intervalLists) {
final String[] splits1 = par.split(":");
final String[] splits2 = splits1[1].split("-");
intervals.add(new Interval(splits1[0], Integer.parseInt(splits2[0]), Integer.parseInt(splits2[1])));
}
return intervals;
}

}
@@ -79,7 +79,7 @@ public class SchemaUtils {
CALL_PGT, CALL_PID, CALL_PS);


public static final List<String> SAMPLE_PLOIDY_FIELDS = Arrays.asList(CHROMOSOME, SAMPLE_ID, GENOTYPE, PLOIDY);
public static final List<String> SAMPLE_PLOIDY_FIELDS = Arrays.asList(CHROMOSOME, SAMPLE_ID, PLOIDY);

public static final List<String> EXTRACT_REF_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, LENGTH_FIELD_NAME, STATE_FIELD_NAME);

@@ -35,15 +35,6 @@
public class ExtractCohortEngine {
private final Logger logger;

// These represent the PARs given for hg38. They are defined differently for other references, so we'll have to store this in
// a better, ref-dependent different manner if/when we get around to supporting other references
// See: https://en.wikipedia.org/wiki/Pseudoautosomal_region
private final List<LongRange> PARRegions = List.of(
new LongRange(23000000010001L, 23000002781479L), // PAR1, X
new LongRange(24000000010001L, 24000002781479L), // PAR1, Y
new LongRange(23000155701383L, 23000156030895L), // PAR2, X
new LongRange(24000056887903L, 24000057217415L)); // PAR2, Y

private final boolean printDebugInformation;
private final int localSortMaxRecordsInRam;
private final List<SimpleInterval> traversalIntervals;
@@ -307,6 +298,7 @@ public void traverse() {

if (samplePloidyTableRef != null) {
try (StorageAPIAvroReader reader = new StorageAPIAvroReader(samplePloidyTableRef, ploidyTableRestriction, projectID)) {
logger.info("Found ploidy lookup table. Reading it into memory");
for (final GenericRecord queryRow : reader) {
// the key will be a basic joining of chromosome (represented as a location) with sample name
String chromosome = queryRow.get(SchemaUtils.CHROMOSOME).toString();
@@ -315,6 +307,7 @@
samplePloidyMap.put(makePloidyLookupKey(chromosome, sampleId), ploidy);
}
processBytesScanned(reader);
logger.info("Finished reading ploidy table into memory. "+samplePloidyMap.size()+" entries read.");
}
}

@@ -604,7 +597,7 @@ VariantContext finalizeCurrentVariant(final List<VariantContext> unmergedVariant
// If we have looked up the ploidy in our table and it says 1, use a haploid reference
// Otherwise, if we have a ploidy that is neither 1 nor 2, throw a user exception because we haven't coded for this case
int ploidy = (samplePloidyMap.containsKey(key) ? samplePloidyMap.get(key) : 2);
if (ploidy == 2 || isInPAR(location)) {
if (ploidy == 2 || PloidyUtils.isLocationInPAR(location)) {
genotypeBuilder.alleles(gtAlleles);
} else if (ploidy == 1) {
genotypeBuilder.alleles(gtHaploidAlleles);
@@ -627,7 +620,22 @@ VariantContext finalizeCurrentVariant(final List<VariantContext> unmergedVariant
for (int sampleId = samplesNotEncountered.nextSetBit(0); sampleId >= 0; sampleId = samplesNotEncountered.nextSetBit(sampleId + 1)) {
genotypeBuilder.reset(false);
genotypeBuilder.name(sampleIdToName.get((long) sampleId));
genotypeBuilder.alleles(gtAlleles);

long chromAsPosition = location - SchemaUtils.decodePosition(location);
String key = makePloidyLookupKey(Long.toString(chromAsPosition), Integer.toString(sampleId));

// Logic for determining the correct ploidy for reference data
// If we have no info in the table, if the ploidy is explicitly 2, OR if we are in a PAR, use a diploid reference.
// If we have looked up the ploidy in our table and it says 1, use a haploid reference
// Otherwise, if we have a ploidy that is neither 1 nor 2, throw a user exception because we haven't coded for this case
int ploidy = (samplePloidyMap.containsKey(key) ? samplePloidyMap.get(key) : 2);
if (ploidy == 2 || PloidyUtils.isLocationInPAR(location)) {
genotypeBuilder.alleles(gtAlleles);
} else if (ploidy == 1) {
genotypeBuilder.alleles(gtHaploidAlleles);
} else {
throw new UserException("GVS cannot currently handle extracting with a ploidy of "+ploidy+" as seen at "+SchemaUtils.decodeContig(location)+": "+SchemaUtils.decodePosition(location)+".");
}
genotypeBuilder.GQ(inferredReferenceState.getReferenceGQ());
genotypes.add(genotypeBuilder.make());
}
@@ -677,16 +685,6 @@ private String makePloidyLookupKey(final String chromosome, final String sampleI
return chromosome+"-"+sampleId;
}

private boolean isInPAR(final long location) {
for (LongRange region : PARRegions) {
if (region.containsLong(location)) {
return true;
}
}
return false;
}


private VariantContext filterSiteByAlleleSpecificVQScore(VariantContext mergedVC,
Map<Allele, Map<Allele, Double>> scoreMap,
Map<Allele, Map<Allele, Double>> vqScoreMap,
@@ -26,11 +26,7 @@

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.*;

/**
* Ingest variant walker
@@ -47,6 +43,7 @@ public final class CreateVariantIngestFiles extends VariantWalker {
private RefCreator refCreator;
private VetCreator vetCreator;
private VcfHeaderLineScratchCreator vcfHeaderLineScratchCreator;
private SamplePloidyCreator samplePloidyCreator;
private LoadStatus loadStatus;

private final Map<String, Boolean> allLineHeaders = new HashMap<>();
@@ -292,6 +289,10 @@ public void onTraversalStart() {

if (enableReferenceRanges && !refRangesRowsExist) {
refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences);

// The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypes
// directly. If we're not ingesting references, we can't compute and ingest ploidy either
samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName);
}

if (enableVet && !vetRowsExist) {
Expand Down Expand Up @@ -384,6 +385,18 @@ public Object onTraversalSuccess() {
}
// Wait until all data has been submitted and in pending state to commit
refCreator.commitData();

// this is likely an unnecessary check as it currently stands, but it can't hurt to have it in case we
// later separate their creation, putting the ploidy logic explicitly behind a flag
if (samplePloidyCreator != null) {
try {
samplePloidyCreator.apply(refCreator.getReferencePloidyData(), refCreator.getTotalRefEntries());
} catch (IOException ioe) {
throw new GATKException("Error writing ploidy data", ioe);
}

samplePloidyCreator.commitData();
}
}

if (vetCreator != null) {
@@ -407,6 +420,9 @@ public void closeTool() {
if (vetCreator != null) {
vetCreator.closeTool();
}
if (samplePloidyCreator != null) {
samplePloidyCreator.closeTool();
}
if (vcfHeaderLineScratchCreator != null) {
vcfHeaderLineScratchCreator.closeTool();
}
@@ -5,10 +5,7 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
import org.broadinstitute.hellbender.tools.gvs.common.GQStateEnum;
import org.broadinstitute.hellbender.tools.gvs.common.IngestConstants;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
import org.broadinstitute.hellbender.tools.gvs.common.*;
import org.broadinstitute.hellbender.utils.GenomeLoc;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
import org.broadinstitute.hellbender.utils.GenomeLocSortedSet;
@@ -17,9 +14,7 @@

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;


public final class RefCreator {
@@ -38,6 +33,12 @@ public final class RefCreator {
private static final String PREFIX_SEPARATOR = "_";
private final static String REF_RANGES_FILETYPE_PREFIX = "ref_ranges_";

private Map<String, BitSet> ploidiesPerChromosome = null;
private Map<String, Map<Integer, Long>> ploidiesCountPerChromosome = null;

// for easily calculating percentages later
private long totalRefEntries = 0L;

public static boolean doRowsExistFor(CommonCode.OutputType outputType, String projectId, String datasetName, String tableNumber, Long sampleId) {
if (outputType != CommonCode.OutputType.BQ) return false;
return BigQueryUtils.doRowsExistFor(projectId, datasetName, REF_RANGES_FILETYPE_PREFIX + tableNumber, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
@@ -50,6 +51,9 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin
this.storeCompressedReferences = storeCompressedReferences;
this.gqStatesToIgnore = gqStatesToIgnore;

this.ploidiesPerChromosome = new HashMap<>();
this.ploidiesCountPerChromosome = new HashMap<>();

coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));

try {
@@ -100,6 +104,31 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) thro
// if we are writing ref ranges, and this is a reference block, write it!
if (writeReferenceRanges) {
if (variant.isReferenceBlock()) {

// Record reference ploidy if this is not in a PAR
if (!PloidyUtils.doesVariantOverlapPAR(variant)) {
// create the counts map for this chromosome if it isn't there
if (!ploidiesCountPerChromosome.containsKey(variant.getContig())) {
ploidiesCountPerChromosome.put(variant.getContig(), new HashMap<>());
}
// look up the counts for this chromosome so we can record having seen this ploidy
Map<Integer, Long> ploidyCounts = ploidiesCountPerChromosome.get(variant.getContig());

int ploidy = variant.getMaxPloidy(1);

Long currentCount = 0L;
if (ploidyCounts.containsKey(ploidy)) {
currentCount = ploidyCounts.get(ploidy);
}

// increment counts for this one and put it back
++currentCount;
ploidyCounts.put(ploidy, currentCount);

++totalRefEntries;
}


// break up reference blocks to be no longer than MAX_REFERENCE_BLOCK_SIZE
int localStart = start;
while ( localStart <= end ) {
@@ -262,6 +291,14 @@ public static Set<GQStateEnum> getGQStateEnumGreaterThan(GQStateEnum s){
return ret;
}

public Map<String, Map<Integer, Long>> getReferencePloidyData() {
return ploidiesCountPerChromosome;
}

public long getTotalRefEntries() {
return totalRefEntries;
}

public void commitData() {
if (outputType == CommonCode.OutputType.BQ) {
if (writeReferenceRanges && refRangesWriter != null) {