Skip to content

Commit

Permalink
Merge pull request #2094 from broadinstitute/cn_vqsr
Browse files Browse the repository at this point in the history
VariantRecalibrator and ApplyVQSR port part 1 (no integration tests).
  • Loading branch information
cmnbroad authored Sep 14, 2016
2 parents e500887 + 082305e commit 4ec1b85
Show file tree
Hide file tree
Showing 37 changed files with 4,141 additions and 113 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,14 @@ public static String encodeValueList(final List<Double> valueList, final String
}
return StringUtils.join(outputList, ",");
}

/**
 * Joins the supplied strings into a single comma-delimited String.
 * @param stringList the strings to join, in iteration order
 * @return the elements of {@code stringList} separated by commas
 */
public static String encodeStringList( final List<String> stringList) {
    final String delimiter = ",";
    return StringUtils.join(stringList, delimiter);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import org.broadinstitute.hellbender.utils.samples.PedigreeValidationType;
import org.broadinstitute.hellbender.utils.samples.SampleDB;
import org.broadinstitute.hellbender.utils.samples.SampleDBBuilder;
import org.broadinstitute.hellbender.utils.samples.SampleUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.text.XReadLines;
import org.broadinstitute.hellbender.utils.variant.*;
Expand Down Expand Up @@ -487,17 +488,20 @@ public void onTraversalStart() {
IDsToKeep = getIDsFromFile(rsIDFile);
IDsToRemove = getIDsFromFile(XLrsIDFile);

//TODO: this should be refactored/consolidated as part of
// https://github.com/broadinstitute/gatk/issues/121 and
// https://github.com/broadinstitute/gatk/issues/1116
Set<VCFHeaderLine> actualLines = null;
SAMSequenceDictionary sequenceDictionary = null;
if (hasReference()) {
File refFile = referenceArguments.getReferenceFile();
sequenceDictionary= this.getReferenceDictionary();
actualLines = withUpdatedContigsAsLines(headerLines, refFile, sequenceDictionary, suppressReferencePath);
actualLines = VcfUtils.updateHeaderContigLines(headerLines, refFile, sequenceDictionary, suppressReferencePath);
}
else {
sequenceDictionary = getHeaderForVariants().getSequenceDictionary();
if (null != sequenceDictionary) {
actualLines = withUpdatedContigsAsLines(headerLines, null, sequenceDictionary, suppressReferencePath);
actualLines = VcfUtils.updateHeaderContigLines(headerLines, null, sequenceDictionary, suppressReferencePath);
}
else {
actualLines = headerLines;
Expand Down Expand Up @@ -646,7 +650,7 @@ protected VariantFilter makeVariantFilter() {
*/
private SortedSet<String> createSampleNameInclusionList(Map<String, VCFHeader> vcfHeaders) {
final SortedSet<String> vcfSamples = VcfUtils.getSortedSampleSet(vcfHeaders, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
final Collection<String> samplesFromFile = getSamplesFromFiles(sampleFiles);
final Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
final Collection<String> samplesFromExpressions = matchSamplesExpressions(vcfSamples, sampleExpressions);

// first, check overlap between requested and present samples
Expand Down Expand Up @@ -684,7 +688,7 @@ private SortedSet<String> createSampleNameInclusionList(Map<String, VCFHeader> v
}

// Exclude samples take precedence over include - remove any excluded samples
final Collection<String> XLsamplesFromFile = getSamplesFromFiles(XLsampleFiles);
final Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
final Collection<String> XLsamplesFromExpressions = matchSamplesExpressions(vcfSamples, XLsampleExpressions);
samples.removeAll(XLsamplesFromFile);
samples.removeAll(XLsampleNames);
Expand Down Expand Up @@ -732,7 +736,7 @@ private Set<VariantContext.Type> createSampleTypeInclusionList() {
private Set<VCFHeaderLine> createVCFHeaderLineList(Map<String, VCFHeader> vcfHeaders) {

final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfHeaders.values(), true);
headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
headerLines.add(new VCFHeaderLine("source", this.getClass().getSimpleName()));

if (keepOriginalChrCounts) {
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AC_KEY));
Expand All @@ -749,63 +753,6 @@ private Set<VCFHeaderLine> createVCFHeaderLineList(Map<String, VCFHeader> vcfHea
return headerLines;
}

/**
 * Builds a new set of header lines from {@code oldLines} in which the contig lines and the
 * reference line are replaced with ones derived from the given dictionary and reference file.
 *
 * @param oldLines header lines to copy; existing contig lines and the existing reference line are dropped
 * @param referenceFile reference file used for the new reference line (and passed on when building
 *                      the contig lines). May be null, in which case no reference line is added
 * @param refDict sequence dictionary from which the new contig lines are created
 * @param referenceNameOnly if true, the reference line value is the reference file name with its last
 *                          extension removed; otherwise it is {@code "file://"} followed by the absolute path
 * @return a new insertion-ordered set containing the retained lines, then the new contig lines,
 *         then (if {@code referenceFile} is non-null) the new reference line
 */
private static Set<VCFHeaderLine> withUpdatedContigsAsLines(
        final Set<VCFHeaderLine> oldLines,
        final File referenceFile,
        final SAMSequenceDictionary refDict,
        final boolean referenceNameOnly) {
    final Set<VCFHeaderLine> lines = new LinkedHashSet<>(oldLines.size());

    for (final VCFHeaderLine line : oldLines) {
        // Old contig lines and the old reference line are regenerated below, so skip them here.
        if (line instanceof VCFContigHeaderLine || line.getKey().equals(VCFHeader.REFERENCE_KEY)) {
            continue;
        }
        lines.add(line);
    }

    // The helper already returns a List; add it directly instead of re-collecting it through a stream.
    lines.addAll(makeContigHeaderLines(refDict, referenceFile));

    if (referenceFile != null) {
        final String referenceValue;
        if (referenceNameOnly) {
            // Strip everything from the last '.' onward; names without a '.' are used unchanged.
            final String referenceName = referenceFile.getName();
            final int extensionStart = referenceName.lastIndexOf('.');
            referenceValue = extensionStart == -1 ? referenceName : referenceName.substring(0, extensionStart);
        }
        else {
            referenceValue = "file://" + referenceFile.getAbsolutePath();
        }
        lines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, referenceValue));
    }
    return lines;
}

/**
 * Produces one VCF contig header line per sequence in the dictionary, in dictionary order.
 * When a reference file is supplied, its file name is passed along as the assembly for every line.
 * @param refDict reference dictionary supplying the sequences
 * @param referenceFile source of the assembly name. May be null, in which case no assembly is passed
 * @return list of vcf contig header lines
 */
private static List<VCFContigHeaderLine> makeContigHeaderLines(final SAMSequenceDictionary refDict,
                                                               final File referenceFile) {
    final String assemblyName = (referenceFile == null) ? null : referenceFile.getName();
    return refDict.getSequences().stream()
            .map(sequence -> makeContigHeaderLine(sequence, assemblyName))
            .collect(Collectors.toCollection(ArrayList::new));
}

/**
 * Creates a single VCF contig header line for the given sequence record.
 * The line carries the "ID" and "length" attributes, plus "assembly" when one is supplied,
 * and is constructed with the record's sequence index.
 * @param contig sequence record providing the name, length and index
 * @param assembly assembly value to record. May be null, in which case the attribute is omitted
 * @return the contig header line for {@code contig}
 */
private static VCFContigHeaderLine makeContigHeaderLine(final SAMSequenceRecord contig, final String assembly) {
    final Map<String, String> attributes = new LinkedHashMap<>(3);
    attributes.put("ID", contig.getSequenceName());
    attributes.put("length", String.valueOf(contig.getSequenceLength()));
    if (assembly != null) {
        attributes.put("assembly", assembly);
    }
    final int contigIndex = contig.getSequenceIndex();
    return new VCFContigHeaderLine(attributes, contigIndex);
}

/**
* Entry-point function to initialize the samples database from input data
*/
Expand All @@ -831,26 +778,6 @@ private static Collection<String> matchSamplesExpressions (Collection<String> or
return samples;
}

/**
 * Reads every line of every given file and gathers the lines into a collection of unique
 * sample names, kept in the order in which they were first encountered.
 * @param files list of files with sample names in. May be null, in which case the result is empty
 * @return a collection of unique samples from all files
 * @throws UserException.CouldNotReadInputFile if reading any of the files fails with an IOException
 */
private static Collection<String> getSamplesFromFiles (final Collection<File> files) {
    final Set<String> samplesFromFiles = new LinkedHashSet<>();
    if (files != null) {
        for (final File file : files) {
            try (XReadLines reader = new XReadLines(file)) {
                // readLines() already yields a List; add it directly rather than copying it via a stream.
                samplesFromFiles.addAll(reader.readLines());
            } catch (IOException e) {
                throw new UserException.CouldNotReadInputFile(file, e);
            }
        }
    }
    return samplesFromFiles;
}

/**
* Get IDs from a file
*
Expand Down
Loading

0 comments on commit 4ec1b85

Please sign in to comment.