Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SVCluster tool #7541

Merged
merged 8 commits into from
Jan 26, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.COPY_NUMBER_FORMAT;

Expand Down Expand Up @@ -232,8 +233,20 @@ public static int getExpectedCopyNumber(final Genotype genotype) {
return VariantContextGetters.getAttributeAsInt(genotype, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0);
}

public Set<String> getCarrierSamples() {
return genotypes.stream().filter(this::isCarrier).map(Genotype::getSampleName).collect(Collectors.toSet());
public Set<String> getCarrierSampleSet() {
return getCarrierSampleStream().collect(Collectors.toSet());
}

public List<Genotype> getCarrierGenotypeList() {
return getCarrierGenotypeStream().collect(Collectors.toList());
}

private Stream<String> getCarrierSampleStream() {
return getCarrierGenotypeStream().map(Genotype::getSampleName);
}

private Stream<Genotype> getCarrierGenotypeStream() {
return genotypes.stream().filter(this::isCarrier);
}

public boolean isDepthOnly() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVCollapser;
import org.broadinstitute.hellbender.tools.sv.cluster.PloidyTable;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;

import java.util.*;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -64,44 +65,32 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record)
builder.attribute(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, record.getContigB());
builder.attribute(GATKSVVCFConstants.END2_ATTRIBUTE, end2);
}
if (!svtype.equals(StructuralVariantType.BND)) {
if (svtype.equals(StructuralVariantType.INS)) {
builder.attribute(GATKSVVCFConstants.SVLEN, record.getLength());
}
if (svtype.equals(StructuralVariantType.BND) || svtype.equals(StructuralVariantType.INV)) {
builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record));
}

// Generate alleles for DEL genotypes, which can be inferred from expected and actual copy numbers
final List<Genotype> newGenotypes = new ArrayList<>(record.getGenotypes().size());
if (svtype.equals(StructuralVariantType.DEL)) {
for (final Genotype g : record.getGenotypes()) {
Utils.validate(altAlleles.size() == 1, "Encountered deletion with multiple ALT alleles");
Utils.validate(g.hasExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT),
"Deletion genotype missing " + GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT + " field");
Utils.validate(g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT),
"Deletion genotype missing " + GATKSVVCFConstants.COPY_NUMBER_FORMAT + " field");
final int expectedCopyNumber = VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0);
final int copyNumber = VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0);
final int numAltAlleles = expectedCopyNumber - copyNumber;
Utils.validate(numAltAlleles >= 0, "Invalid copy number " + copyNumber +
" for deletion genotype with expected copy number " + expectedCopyNumber);
final List<Allele> genotypeAlleles = new ArrayList<>(expectedCopyNumber);
for (int i = 0; i < copyNumber; i++) {
genotypeAlleles.add(refAllele);
}
for (int i = copyNumber; i < numAlleles; i++) {
genotypeAlleles.add(altAlleles.get(0));
}
newGenotypes.add(new GenotypeBuilder(g).alleles(genotypeAlleles).make());
}
builder.genotypes(newGenotypes);
if (record.getGenotypes().stream().anyMatch(g -> g.getAlleles().isEmpty())) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It feels inefficient to stream over the genotypes twice (first to check if any are empty, then to sanitize), even if the anyMatch stream stops at the first match. Why not just always return the mapped stream below? Or do you think the overhead costs of mapping and re-collecting outweigh the benefits?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you're right, this was a premature optimization on my part. I was thinking it would be relatively rare to have empty alleles and this would make it faster by not having to make a copy of the genotypes. On second thought, I'm not sure this would have a huge effect, and could lead to some puzzling performance inconsistency for different vcfs (e.g. a chrY vcf would usually run slower since it's likely to have a lot of empty genotypes). I've changed it to just simply call sanitize on every genotype.

// htsjdk vcf encoder does not allow genotypes to have empty alleles
builder.genotypes(record.getGenotypes().stream().map(SVCallRecordUtils::sanitizeEmptyGenotype).collect(Collectors.toList()));
} else {
builder.genotypes(record.getGenotypes());
}

return builder;
}

/**
* Adds NO_CALL allele if empty
*/
private static Genotype sanitizeEmptyGenotype(final Genotype g) {
if (g.getAlleles().isEmpty()) {
return new GenotypeBuilder(g).alleles(Collections.singletonList(Allele.NO_CALL)).make();
} else {
return g;
}
}

/**
* Creates a new {@link GenotypesContext} object augmented with the given sample set. Samples with existing
* genotypes are not touched. Samples without genotypes are assigned the provided sets of alleles and attributes.
Expand All @@ -111,24 +100,33 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record)
* @param attributes attributes to apply to all new genotypes
* @return genotypes augmented with missing samples
*/
public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(final GenotypesContext genotypes,
public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(final SVCallRecord record,
final Set<String> samples,
final List<Allele> alleles,
final Map<String, Object> attributes) {
Utils.nonNull(genotypes);
final boolean refAlleleDefault,
final PloidyTable ploidyTable) {
Utils.nonNull(record);
Utils.nonNull(samples);
final GenotypesContext genotypes = record.getGenotypes();
final Set<String> missingSamples = Sets.difference(samples, genotypes.getSampleNames());
if (missingSamples.isEmpty()) {
return genotypes;
}
final ArrayList<Genotype> newGenotypes = new ArrayList<>(genotypes.size() + missingSamples.size());
newGenotypes.addAll(genotypes);
final String contig = record.getContigA();
final List<Allele> altAlleles = record.getAltAlleles();
final Allele refAllele = record.getRefAllele();
final boolean isCNV = record.isSimpleCNV();
for (final String sample : missingSamples) {
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(sample);
if (attributes != null) {
genotypeBuilder.attributes(attributes);
final int ploidy = ploidyTable.get(sample, contig);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd add some documentation in the method javadoc about how this logic works (and add @param docs for refAlleleDefault and ploidyTable).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

genotypeBuilder.attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy);
if (isCNV) {
genotypeBuilder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, ploidy);
genotypeBuilder.alleles(CanonicalSVCollapser.getCNVGenotypeAllelesFromCopyNumber(altAlleles, refAllele, ploidy, ploidy));
} else {
genotypeBuilder.alleles(Collections.nCopies(ploidy, refAlleleDefault ? refAllele : Allele.NO_CALL));
}
genotypeBuilder.alleles(alleles);
newGenotypes.add(genotypeBuilder.make());
}
return GenotypesContext.create(newGenotypes);
Expand Down Expand Up @@ -393,10 +391,18 @@ public static boolean containsAltAllele(final Genotype g) {
return g.getAlleles().stream().anyMatch(SVCallRecordUtils::isAltAllele);
}

public static boolean isAltGenotype(final Genotype g) {
return g.getAlleles().stream().anyMatch(SVCallRecordUtils::isAltAllele);
}

public static boolean isAltAllele(final Allele allele) {
return allele != null && !allele.isNoCall() && !allele.isReference();
}

public static boolean isNonRefAllele(final Allele allele) {
return allele != null && !allele.isReference();
}

// TODO this is sort of hacky but the Allele compareTo() method doesn't give stable ordering
public static List<Allele> sortAlleles(final Collection<Allele> alleles) {
return alleles.stream().sorted(Comparator.nullsFirst(Comparator.comparing(Allele::getDisplayString))).collect(Collectors.toList());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ public boolean areClusterable(final SVCallRecord a, final SVCallRecord b) {

// In the single-sample case, match copy number strictly if we're looking at the same sample
// TODO repeated check for CN attributes in hasSampleOverlap and getCarrierSamples
final Set<String> carriersA = a.getCarrierSamples();
final Set<String> carriersB = b.getCarrierSamples();
final Set<String> carriersA = a.getCarrierSampleSet();
final Set<String> carriersB = b.getCarrierSampleSet();
if (carriersA.size() == 1 && carriersA.equals(carriersB)) {
final Genotype genotypeA = a.getGenotypes().get(carriersA.iterator().next());
final Genotype genotypeB = b.getGenotypes().get(carriersB.iterator().next());
Expand Down Expand Up @@ -90,6 +90,9 @@ public boolean areClusterable(final SVCallRecord a, final SVCallRecord b) {
@Override
public int getMaxClusterableStartingPosition(final SVCallRecord call) {
final int contigLength = dictionary.getSequence(call.getContigA()).getSequenceLength();
if (!call.isSimpleCNV()) {
return 0;
}
final int maxTheoreticalStart = (int) Math.floor((call.getPositionB() + paddingFraction * (call.getLength() + contigLength)) / (1.0 + paddingFraction));
return Math.min(maxTheoreticalStart, dictionary.getSequence(call.getContigA()).getSequenceLength());
}
Expand Down
Loading