-
Notifications
You must be signed in to change notification settings - Fork 593
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SVCluster tool #7541
Merged
Merged
SVCluster tool #7541
Changes from 5 commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
27d6afa
Implement SVCluster tool; improve clustering backend and tests
mwalker174 2c3631d
Fix compiler warning
mwalker174 9224741
Fix some tests and interval collapsing
mwalker174 500c238
Fix integration test resources dir name
mwalker174 1cdb219
Fix resource path in PloidyTableTest
mwalker174 55a7e20
Address comments
mwalker174 fe7f7ba
Fix compiler warning
mwalker174 0f3004d
Add SV type check to CanonicalSVCollapser
mwalker174 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,10 +7,11 @@ | |
import htsjdk.variant.vcf.VCFConstants; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; | ||
import org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVCollapser; | ||
import org.broadinstitute.hellbender.tools.sv.cluster.PloidyTable; | ||
import org.broadinstitute.hellbender.utils.IntervalUtils; | ||
import org.broadinstitute.hellbender.utils.SimpleInterval; | ||
import org.broadinstitute.hellbender.utils.Utils; | ||
import org.broadinstitute.hellbender.utils.variant.VariantContextGetters; | ||
|
||
import java.util.*; | ||
import java.util.stream.Collectors; | ||
|
@@ -64,44 +65,32 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record) | |
builder.attribute(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, record.getContigB()); | ||
builder.attribute(GATKSVVCFConstants.END2_ATTRIBUTE, end2); | ||
} | ||
if (!svtype.equals(StructuralVariantType.BND)) { | ||
if (svtype.equals(StructuralVariantType.INS)) { | ||
builder.attribute(GATKSVVCFConstants.SVLEN, record.getLength()); | ||
} | ||
if (svtype.equals(StructuralVariantType.BND) || svtype.equals(StructuralVariantType.INV)) { | ||
builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record)); | ||
} | ||
|
||
// Generate alleles for DEL genotypes, which can be inferred from expected and actual copy numbers | ||
final List<Genotype> newGenotypes = new ArrayList<>(record.getGenotypes().size()); | ||
if (svtype.equals(StructuralVariantType.DEL)) { | ||
for (final Genotype g : record.getGenotypes()) { | ||
Utils.validate(altAlleles.size() == 1, "Encountered deletion with multiple ALT alleles"); | ||
Utils.validate(g.hasExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT), | ||
"Deletion genotype missing " + GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT + " field"); | ||
Utils.validate(g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT), | ||
"Deletion genotype missing " + GATKSVVCFConstants.COPY_NUMBER_FORMAT + " field"); | ||
final int expectedCopyNumber = VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0); | ||
final int copyNumber = VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0); | ||
final int numAltAlleles = expectedCopyNumber - copyNumber; | ||
Utils.validate(numAltAlleles >= 0, "Invalid copy number " + copyNumber + | ||
" for deletion genotype with expected copy number " + expectedCopyNumber); | ||
final List<Allele> genotypeAlleles = new ArrayList<>(expectedCopyNumber); | ||
for (int i = 0; i < copyNumber; i++) { | ||
genotypeAlleles.add(refAllele); | ||
} | ||
for (int i = copyNumber; i < numAlleles; i++) { | ||
genotypeAlleles.add(altAlleles.get(0)); | ||
} | ||
newGenotypes.add(new GenotypeBuilder(g).alleles(genotypeAlleles).make()); | ||
} | ||
builder.genotypes(newGenotypes); | ||
if (record.getGenotypes().stream().anyMatch(g -> g.getAlleles().isEmpty())) { | ||
// htsjdk vcf encoder does not allow genotypes to have empty alleles | ||
builder.genotypes(record.getGenotypes().stream().map(SVCallRecordUtils::sanitizeEmptyGenotype).collect(Collectors.toList())); | ||
} else { | ||
builder.genotypes(record.getGenotypes()); | ||
} | ||
|
||
return builder; | ||
} | ||
|
||
/** | ||
* Ensures that a genotype carries at least one allele. The htsjdk VCF encoder does not allow genotypes | ||
* to have empty alleles, so a genotype with no alleles is given a single no-call allele. | ||
* | ||
* @param g genotype to check | ||
* @return a new genotype built from {@code g} whose only allele is {@link Allele#NO_CALL} if {@code g} | ||
*         has no alleles; otherwise {@code g} itself, unmodified | ||
*/ | ||
private static Genotype sanitizeEmptyGenotype(final Genotype g) { | ||
if (g.getAlleles().isEmpty()) { | ||
return new GenotypeBuilder(g).alleles(Collections.singletonList(Allele.NO_CALL)).make(); | ||
} else { | ||
return g; | ||
} | ||
} | ||
|
||
/** | ||
* Creates a new {@link GenotypesContext} object augmented with the given sample set. Samples with existing | ||
* genotypes are not touched. Samples without genotypes are assigned the provided sets of alleles and attributes. | ||
|
@@ -111,24 +100,33 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record) | |
* @param attributes attributes to apply to all new genotypes | ||
* @return genotypes augmented with missing samples | ||
*/ | ||
public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(final GenotypesContext genotypes, | ||
public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(final SVCallRecord record, | ||
final Set<String> samples, | ||
final List<Allele> alleles, | ||
final Map<String, Object> attributes) { | ||
Utils.nonNull(genotypes); | ||
final boolean refAlleleDefault, | ||
final PloidyTable ploidyTable) { | ||
Utils.nonNull(record); | ||
Utils.nonNull(samples); | ||
final GenotypesContext genotypes = record.getGenotypes(); | ||
final Set<String> missingSamples = Sets.difference(samples, genotypes.getSampleNames()); | ||
if (missingSamples.isEmpty()) { | ||
return genotypes; | ||
} | ||
final ArrayList<Genotype> newGenotypes = new ArrayList<>(genotypes.size() + missingSamples.size()); | ||
newGenotypes.addAll(genotypes); | ||
final String contig = record.getContigA(); | ||
final List<Allele> altAlleles = record.getAltAlleles(); | ||
final Allele refAllele = record.getRefAllele(); | ||
final boolean isCNV = record.isSimpleCNV(); | ||
for (final String sample : missingSamples) { | ||
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(sample); | ||
if (attributes != null) { | ||
genotypeBuilder.attributes(attributes); | ||
final int ploidy = ploidyTable.get(sample, contig); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd add some documentation in the method javadoc about how this logic works (and add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
genotypeBuilder.attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy); | ||
if (isCNV) { | ||
genotypeBuilder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, ploidy); | ||
genotypeBuilder.alleles(CanonicalSVCollapser.getCNVGenotypeAllelesFromCopyNumber(altAlleles, refAllele, ploidy, ploidy)); | ||
} else { | ||
genotypeBuilder.alleles(Collections.nCopies(ploidy, refAlleleDefault ? refAllele : Allele.NO_CALL)); | ||
} | ||
genotypeBuilder.alleles(alleles); | ||
newGenotypes.add(genotypeBuilder.make()); | ||
} | ||
return GenotypesContext.create(newGenotypes); | ||
|
@@ -393,10 +391,18 @@ public static boolean containsAltAllele(final Genotype g) { | |
return g.getAlleles().stream().anyMatch(SVCallRecordUtils::isAltAllele); | ||
} | ||
|
||
/**
* Tests whether a genotype carries at least one alternate allele, as defined by
* {@link SVCallRecordUtils#isAltAllele(Allele)}. A genotype with no alleles yields {@code false}.
*
* @param g genotype to test
* @return {@code true} if any allele of {@code g} satisfies {@code isAltAllele}
*/
public static boolean isAltGenotype(final Genotype g) { | ||
return g.getAlleles().stream().anyMatch(SVCallRecordUtils::isAltAllele); | ||
} | ||
|
||
/**
* Tests whether an allele is an alternate allele: it must be non-null, must not be a no-call,
* and must not be the reference allele.
*
* @param allele allele to test; may be {@code null}
* @return {@code true} if {@code allele} is non-null, is not a no-call, and is not reference
*/
public static boolean isAltAllele(final Allele allele) { | ||
return allele != null && !allele.isNoCall() && !allele.isReference(); | ||
} | ||
|
||
/**
* Tests whether an allele is non-null and not the reference allele. Unlike
* {@link SVCallRecordUtils#isAltAllele(Allele)}, this performs no no-call check.
*
* @param allele allele to test; may be {@code null}
* @return {@code true} if {@code allele} is non-null and is not reference
*/
public static boolean isNonRefAllele(final Allele allele) { | ||
return allele != null && !allele.isReference(); | ||
} | ||
|
||
// TODO this is sort of hacky but the Allele compareTo() method doesn't give stable ordering | ||
public static List<Allele> sortAlleles(final Collection<Allele> alleles) { | ||
return alleles.stream().sorted(Comparator.nullsFirst(Comparator.comparing(Allele::getDisplayString))).collect(Collectors.toList()); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It feels inefficient to stream over the genotypes twice (first to check if any are empty, then to sanitize), even if the anyMatch stream stops at the first match. Why not just always return the mapped stream below? Or do you think the overhead costs of mapping and re-collecting outweigh the benefits?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you're right, this was a premature optimization on my part. I was thinking it would be relatively rare to have empty alleles and this would make it faster by not having to make a copy of the genotypes. On second thought, I'm not sure this would have a huge effect, and it could lead to some puzzling performance inconsistency for different VCFs (e.g. a chrY VCF would usually run slower since it's likely to have a lot of empty genotypes). I've changed it to simply call sanitize on every genotype.