-
Notifications
You must be signed in to change notification settings - Fork 593
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
CompareReferences: a new tool for analyzing the differences between r…
…eferences (#7930) * This tool generates an MD5-keyed table comparing specified references and does an analysis to summarize the differences between the references provided. * Comparisons are made against a "special" reference, specified with the -R argument. Subsequent references to be compared may be specified using the --references-to-compare argument. * The table can be directed to a file or standard output using provided command-line arguments. * A supplementary table keyed by sequence name can be displayed using the --display-sequences-by-name argument; to display only sequence names for which the references are not consistent, run with the --display-only-differing-sequences argument as well. * Comprehensive unit and integration tests First part of #6837
- Loading branch information
Showing
49 changed files
with
7,565 additions
and
0 deletions.
There are no files selected for viewing
238 changes: 238 additions & 0 deletions
238
src/main/java/org/broadinstitute/hellbender/tools/reference/CompareReferences.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,238 @@ | ||
package org.broadinstitute.hellbender.tools.reference; | ||
|
||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||
import org.broadinstitute.barclay.argparser.ExperimentalFeature; | ||
import org.broadinstitute.barclay.help.DocumentedFeature; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.engine.GATKPath; | ||
import org.broadinstitute.hellbender.engine.GATKTool; | ||
import org.broadinstitute.hellbender.engine.ReferenceDataSource; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.utils.tsv.DataLine; | ||
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; | ||
import org.broadinstitute.hellbender.utils.tsv.TableWriter; | ||
import picard.cmdline.programgroups.ReferenceProgramGroup; | ||
|
||
import java.io.IOException; | ||
import java.io.OutputStreamWriter; | ||
import java.io.Writer; | ||
import java.nio.file.Path; | ||
import java.util.*; | ||
|
||
/** | ||
* Display reference comparison as a tab-delimited table and summarize reference differences. | ||
* | ||
* <p>This tool generates a MD5-keyed table comparing specified references and does an analysis to summarize the differences | ||
* between the references provided. Comparisons are made against a "special" reference, specified with the -R | ||
* argument. Subsequent references to be compared may be specified using the -references-to-compare argument. | ||
* The table can be directed to a file or standard output using provided command-line arguments. | ||
* A supplementary table keyed by sequence name can be displayed using the -display-sequences-by-name argument; | ||
* to display only sequence names for which the references are not consistent, run with the -display-only-differing-sequences | ||
* argument as well.</p> | ||
* | ||
* <h3>Input</h3> | ||
* <p> | ||
* The references and MD5 calculation mode. | ||
* </p> | ||
* | ||
* <h3>Output</h3> | ||
* <p> | ||
* A TSV file or output stream displaying the MD5-keyed table comparison, and a table analysis displayed to standard output. | ||
* <pre> | ||
* MD5 Length hg19mini.fasta hg19mini_1renamed.fasta | ||
* 8c0c38e352d8f3309eabe4845456f274 16000 1 chr1 | ||
* 5f8388fe3fb34aa38375ae6cf5e45b89 16000 2 2 | ||
* 94de808a3a2203dbb02434a47bd8184f 16000 3 3 | ||
* 7d397ee919e379328d8f52c57a54c778 16000 4 4 | ||
* | ||
* REFERENCE PAIR: hg19mini.fasta, hg19mini_1renamed.fasta | ||
* Status: | ||
* DIFFER_IN_SEQUENCE_NAMES | ||
* </pre> | ||
* </p> | ||
* | ||
* <h3>Usage example</h3> | ||
* <pre> | ||
* gatk CompareReferences \ | ||
* -R reference1.fasta \ | ||
* -refcomp reference2.fasta | ||
* -O output.fasta \ | ||
* -md5-calculation-mode USE_DICT | ||
* </pre> | ||
* | ||
*/ | ||
|
||
@CommandLineProgramProperties( | ||
summary = "Compare multiple references and output a tab-delimited table detailing what the differences are and a summarized analysis of each pair of references.", | ||
oneLineSummary = "Display reference comparison as a tab-delimited table and summarize reference differences.", | ||
programGroup = ReferenceProgramGroup.class | ||
) | ||
|
||
@DocumentedFeature | ||
@ExperimentalFeature | ||
public class CompareReferences extends GATKTool { | ||
|
||
@Argument(fullName = "references-to-compare", shortName = "refcomp", doc = "Reference sequence file(s) to compare.") | ||
private List<GATKPath> references; | ||
|
||
/** | ||
* Output file will be written here. | ||
* | ||
* Note: If no output file provided, table will print to standard output. | ||
*/ | ||
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "If provided, file to output table.", optional = true) | ||
private GATKPath output; | ||
|
||
@Argument(fullName = "md5-calculation-mode", shortName = "md5-calculation-mode", doc = "MD5CalculationMode indicating method of MD5 calculation.", optional = true) | ||
private MD5CalculationMode md5CalculationMode = MD5CalculationMode.RECALCULATE_IF_MISSING; | ||
|
||
@Argument(fullName = "display-sequences-by-name", doc = "If provided, the table by sequence name will be printed.", optional = true) | ||
private boolean displaySequencesByName = false; | ||
|
||
@Argument(fullName = "display-only-differing-sequences", doc = "If provided, only display sequence names that differ in their actual sequence.", optional = true) | ||
private boolean onlyDisplayDifferingSequences = false; | ||
|
||
public enum MD5CalculationMode { | ||
// use only MD5s found in dictionary; if MD5 missing, crashes | ||
USE_DICT, | ||
// use any MD5s found in dictionary and recalculate any missing MD5s | ||
RECALCULATE_IF_MISSING, | ||
// recalculate all MD5s, regardless of presence in dictionary | ||
ALWAYS_RECALCULATE; | ||
} | ||
|
||
private Map<GATKPath, ReferenceDataSource> referenceSources; | ||
|
||
@Override | ||
public boolean requiresReference() { | ||
return true; | ||
} | ||
|
||
@Override | ||
public void onTraversalStart() { | ||
// add data source for -R reference | ||
referenceSources = new LinkedHashMap<>(); | ||
referenceSources.put(referenceArguments.getReferenceSpecifier(), directlyAccessEngineReferenceDataSource()); | ||
|
||
// add data sources for remaining references | ||
for(GATKPath path : references){ | ||
referenceSources.put(path, ReferenceDataSource.of(path.toPath())); | ||
} | ||
} | ||
|
||
@Override | ||
public void traverse(){ | ||
ReferenceSequenceTable table = new ReferenceSequenceTable(referenceSources, md5CalculationMode); | ||
logger.info("Building reference sequence table."); | ||
table.build(); | ||
logger.info("Finished building table."); | ||
|
||
writeTable(table); | ||
|
||
if(displaySequencesByName){ | ||
writeTableBySequenceName(table); | ||
} | ||
|
||
logger.info("Analyzing table."); | ||
List<ReferencePair> referencePairs = table.analyzeTable(); | ||
logger.info("Finished analyzing table."); | ||
System.out.println("*********************************************************"); | ||
for(ReferencePair pair : referencePairs){ | ||
System.out.println(pair); | ||
} | ||
} | ||
|
||
|
||
/** | ||
* Given a table, write table to output. | ||
* | ||
* Note: if no output file specified, displays to standard output. | ||
* | ||
* @param table | ||
*/ | ||
private void writeTable(ReferenceSequenceTable table){ | ||
TableColumnCollection columns = new TableColumnCollection(table.getColumnNames()); | ||
try(CompareReferences.CompareReferencesOutputTableWriter writer = output == null | ||
? new CompareReferences.CompareReferencesOutputTableWriter(new OutputStreamWriter(System.out), columns) | ||
: new CompareReferences.CompareReferencesOutputTableWriter(output.toPath(), columns) | ||
){ | ||
writer.writeHeaderIfApplies(); | ||
for(ReferenceSequenceTable.TableRow row : table){ | ||
writer.writeRecord(row); | ||
} | ||
} | ||
catch(IOException exception){ | ||
throw (output == null) | ||
? new UserException.CouldNotCreateOutputFile("System.out", "Failed to write output table.", exception) | ||
: new UserException.CouldNotCreateOutputFile(output, "Failed to write output table.", exception); | ||
} | ||
} | ||
|
||
/** | ||
* Given a table, write table by sequence name to standard output | ||
* | ||
* @param table | ||
*/ | ||
public void writeTableBySequenceName(ReferenceSequenceTable table){ | ||
System.out.print("*********************************************************\n"); | ||
System.out.print("Name \tMD5 \tReference\n"); | ||
for(String sequenceName : table.getAllSequenceNames()){ | ||
Set<ReferenceSequenceTable.TableRow> rows = table.queryBySequenceName(sequenceName); | ||
if(onlyDisplayDifferingSequences) { | ||
if(rows.size() > 1) { | ||
displayBySequenceName(rows, sequenceName); | ||
} | ||
} else { | ||
displayBySequenceName(rows, sequenceName); | ||
} | ||
} | ||
} | ||
|
||
private void displayBySequenceName(Set<ReferenceSequenceTable.TableRow> rows, String sequenceName){ | ||
System.out.print(sequenceName); | ||
for(ReferenceSequenceTable.TableRow row : rows) { | ||
ReferenceSequenceTable.TableEntry[] entries = row.getEntries(); | ||
System.out.print("\n\t" + row.getMd5() + "\t"); | ||
|
||
for(int i = 2; i < entries.length; i++) { | ||
if(entries[i].getColumnValue().equals(sequenceName)) { | ||
System.out.print(entries[i].getColumnName() + "\t"); | ||
} | ||
} | ||
} | ||
System.out.println(); | ||
} | ||
|
||
@Override | ||
public void closeTool() { | ||
for(Map.Entry<GATKPath, ReferenceDataSource> entry : referenceSources.entrySet()){ | ||
if(!entry.getKey().equals(referenceArguments.getReferenceSpecifier())){ | ||
entry.getValue().close(); | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* TableWriter to format and write the table output. | ||
*/ | ||
public static class CompareReferencesOutputTableWriter extends TableWriter<ReferenceSequenceTable.TableRow> { | ||
|
||
public CompareReferencesOutputTableWriter(final Path table, TableColumnCollection columns) throws IOException { | ||
super(table, columns); | ||
} | ||
|
||
public CompareReferencesOutputTableWriter(final Writer writer, TableColumnCollection columns) throws IOException { | ||
super(writer, columns); | ||
} | ||
|
||
@Override | ||
protected void composeLine(final ReferenceSequenceTable.TableRow record, final DataLine dataLine) { | ||
List<String> columnNames = record.getColumnNames(); | ||
ReferenceSequenceTable.TableEntry[] entries = record.getEntries(); | ||
for(int i = 0; i < columnNames.size(); i++){ | ||
dataLine.set(entries[i].getColumnName(), entries[i].getColumnValue()); | ||
} | ||
} | ||
} | ||
} |
116 changes: 116 additions & 0 deletions
116
src/main/java/org/broadinstitute/hellbender/tools/reference/ReferencePair.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
package org.broadinstitute.hellbender.tools.reference; | ||
|
||
import org.broadinstitute.hellbender.engine.GATKPath; | ||
|
||
import java.util.*; | ||
|
||
/** | ||
* Class representing a pair of references and their differences. | ||
* | ||
*/ | ||
public class ReferencePair { | ||
|
||
private final GATKPath ref1; | ||
|
||
private final GATKPath ref2; | ||
|
||
private final int ref1ColumnIndex; | ||
|
||
private final int ref2ColumnIndex; | ||
|
||
public enum Status{ | ||
// references match exactly (table entries identical across references) | ||
EXACT_MATCH, | ||
// found sequences with same MD5s but inconsistent names between the 2 references | ||
DIFFER_IN_SEQUENCE_NAMES, | ||
// found sequences with same names but inconsistent MD5s between the 2 references | ||
DIFFER_IN_SEQUENCE, | ||
// at least one sequence is present in one reference but not the other | ||
DIFFER_IN_SEQUENCES_PRESENT, | ||
// ref1 is a superset of ref2 | ||
SUPERSET, | ||
// ref1 is a subset of ref2 | ||
SUBSET; | ||
} | ||
|
||
private EnumSet<Status> analysis; | ||
|
||
public ReferencePair(ReferenceSequenceTable table, GATKPath reference1, GATKPath reference2){ | ||
ref1 = reference1; | ||
ref2 = reference2; | ||
ref1ColumnIndex = table.getColumnIndices().get(ReferenceSequenceTable.getReferenceColumnName(ref1)); | ||
ref2ColumnIndex = table.getColumnIndices().get(ReferenceSequenceTable.getReferenceColumnName(ref2)); | ||
// assume EXACT_MATCH until proven otherwise - if any discrepancies found, EXACT_MATCH status removed | ||
analysis = EnumSet.of(Status.EXACT_MATCH); | ||
} | ||
|
||
/** | ||
* Given a Status, add it to the ReferencePair's analysis | ||
* | ||
* @param status | ||
*/ | ||
public void addStatus(Status status) { | ||
analysis.add(status); | ||
} | ||
|
||
/** | ||
* Given a Status, remove it from the ReferencePair's analysis, if present | ||
* | ||
* @param status | ||
*/ | ||
public void removeStatus(Status status){ | ||
analysis.remove(status); | ||
} | ||
|
||
public int getRef1ColumnIndex() { | ||
return ref1ColumnIndex; | ||
} | ||
|
||
public int getRef2ColumnIndex() { | ||
return ref2ColumnIndex; | ||
} | ||
|
||
public String getRef1(){ | ||
return ReferenceSequenceTable.getReferenceColumnName(ref1); | ||
} | ||
public String getRef2(){ | ||
return ReferenceSequenceTable.getReferenceColumnName(ref2); | ||
} | ||
|
||
/** | ||
* Displays a ReferencePair's status set | ||
* | ||
* @return the status set as a formatted String | ||
*/ | ||
public String statusAsString(){ | ||
String output = ""; | ||
for(Status status : analysis){ | ||
output += String.format("\t%s\n", status.name()); | ||
} | ||
return output; | ||
} | ||
|
||
public EnumSet<Status> getAnalysis(){ | ||
return analysis; | ||
} | ||
|
||
public String toString(){ | ||
return String.format("REFERENCE PAIR: %s, %s\nStatus:\n%s", | ||
ReferenceSequenceTable.getReferenceColumnName(ref1), | ||
ReferenceSequenceTable.getReferenceColumnName(ref2), | ||
statusAsString()); | ||
} | ||
|
||
@Override | ||
public boolean equals(Object o) { | ||
if (this == o) return true; | ||
if (o == null || getClass() != o.getClass()) return false; | ||
ReferencePair that = (ReferencePair) o; | ||
return Objects.equals(ref1, that.ref1) && Objects.equals(ref2, that.ref2); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash(ref1, ref2); | ||
} | ||
} |
Oops, something went wrong.