Skip to content

Commit

Permalink
Added MD5/SHA1/SHA256 options for verifying FASTQ
Browse files Browse the repository at this point in the history
File hashes will be verified at the same time as running fastq-check to
validate the records / gzip.
  • Loading branch information
mbreese committed Apr 13, 2019
1 parent 6920bad commit 05aa11e
Showing 1 changed file with 189 additions and 11 deletions.
200 changes: 189 additions & 11 deletions src/java/io/compgen/ngsutils/cli/fastq/FastqCheck.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
package io.compgen.ngsutils.cli.fastq;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.math.BigInteger;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Iterator;
import java.util.zip.GZIPOutputStream;

Expand All @@ -12,6 +18,7 @@
import io.compgen.cmdline.annotation.UnnamedArg;
import io.compgen.cmdline.exceptions.CommandArgumentException;
import io.compgen.cmdline.impl.AbstractCommand;
import io.compgen.common.StringLineReader;
import io.compgen.ngsutils.fastq.Fastq;
import io.compgen.ngsutils.fastq.FastqRead;
import io.compgen.ngsutils.fastq.FastqReader;
Expand All @@ -21,15 +28,38 @@ public class FastqCheck extends AbstractCommand {
private String[] filenames;
private String out1Filename;
private String out2Filename;
private boolean colorspace = false;
private boolean colorspace = false;
private boolean gzip = false;

private String digestFilename = null;
private String digest = null;

/**
 * Creates the command with its default settings; all options are
 * populated afterwards through the annotated setter methods.
 */
public FastqCheck() {
    super();
}

/**
 * Stores the input file name(s) passed as unnamed command-line arguments.
 *
 * @param filenames the FASTQ path(s) given on the command line
 * @throws IOException declared for interface compatibility; not thrown here
 */
@UnnamedArg(name = "FILE {FILE2}")
public void setFilename(final String[] filenames) throws IOException {
    this.filenames = filenames;
}

/**
 * Selects MD5 verification and remembers the checksum file to read the
 * expected hashes from.
 *
 * @param filename path of the file listing the expected MD5 sums
 * @throws IOException declared for interface compatibility; not thrown here
 */
@Option(name="md5", desc="Read the MD5 sums for the FASTQ file(s) from this file and check them")
public void setMD5(final String filename) throws IOException {
    this.digest = "MD5";
    this.digestFilename = filename;
}

/**
 * Selects SHA1 verification and remembers the checksum file to read the
 * expected hashes from.
 *
 * @param filename path of the file listing the expected SHA1 sums
 * @throws IOException declared for interface compatibility; not thrown here
 */
@Option(name="sha1", desc="Read the SHA1 sums for the FASTQ file(s) from this file and check them")
public void setSHA1(final String filename) throws IOException {
    this.digest = "SHA1";
    this.digestFilename = filename;
}

/**
 * Selects SHA-256 verification and remembers the checksum file to read the
 * expected hashes from.
 *
 * @param filename path of the file listing the expected SHA256 sums
 * @throws IOException declared for interface compatibility; not thrown here
 */
@Option(name="sha256", desc="Read the SHA256 sums for the FASTQ file(s) from this file and check them")
public void setSHA256(String filename) throws IOException {
    this.digestFilename = filename;
    // Use the standard JCA algorithm name: "SHA256" (no hyphen) is only a
    // provider alias on some JDKs and makes MessageDigest.getInstance()
    // throw NoSuchAlgorithmException on others.
    this.digest = "SHA-256";
}

@Option(name="out1", desc="Write all valid reads (R1) to this file (filenames ending in .gz will be compressed)")
public void setOut1(String filename) throws IOException {
Expand All @@ -41,13 +71,18 @@ public void setOut2(String filename) throws IOException {
this.out2Filename = filename;
}

@Option(name="colorspace", desc="Reads are in color-space (default: base-space)")
/**
 * Controls whether output files are gzip compressed even when their
 * names do not end in ".gz".
 *
 * @param value {@code true} to force gzip compression of the output
 */
@Option(name="gz", desc="Output files should be gzip compressed (regardless of suffix)")
public void setGZip(final boolean value) {
    gzip = value;
}

/**
 * Marks the input reads as color-space rather than base-space.
 *
 * @param value {@code true} when the reads are in color-space
 */
@Option(name="colorspace", desc="Reads are in color-space (default: base-space)")
public void setColorspace(final boolean value) {
    colorspace = value;
}

@Exec
public void exec() throws IOException, CommandArgumentException {
public void exec() throws IOException, CommandArgumentException, NoSuchAlgorithmException {
long[] counts;
if (filenames.length == 1) {
counts = execSingleFile(filenames[0]);
Expand Down Expand Up @@ -98,10 +133,72 @@ protected boolean checkSeqQualLength(FastqRead read) {
return true;
}

protected long[] execPairedFiles(String filename1, String filename2) throws IOException {
protected long[] execPairedFiles(String filename1, String filename2) throws IOException, NoSuchAlgorithmException {
System.err.println("Reading files: "+filename1+", "+filename2);
final FastqReader reader1 = Fastq.open(filename1);
final FastqReader reader2 = Fastq.open(filename2, true);

FileInputStream fis1 = new FileInputStream(filename1);
DigestInputStream dis1 = null;
String targetHash1 = null;
FileInputStream fis2 = new FileInputStream(filename2);
DigestInputStream dis2 = null;
String targetHash2 = null;

final FastqReader reader1;
final FastqReader reader2;

if (digestFilename != null) {

StringLineReader byline = new StringLineReader(digestFilename);
for (String line: byline) {
String[] spl = line.split(" +");
if (spl[1].charAt(0)=='*') {
spl[1] = spl[1].substring(1);
}
// in the first pass, match by an equal filename
if (filename1.equals(spl[1])) {
targetHash1 = spl[0].toLowerCase();
}
// in the second pass, match by just the filename
else if (new File(spl[1]).getName().equals(new File(filename1).getName())) {
targetHash1 = spl[0].toLowerCase();
}
// in the first pass, match by an equal filename
if (filename2.equals(spl[1])) {
targetHash2 = spl[0].toLowerCase();
}
// in the second pass, match by just the filename
else if (new File(spl[1]).getName().equals(new File(filename2).getName())) {
targetHash2 = spl[0].toLowerCase();
}
}
if (targetHash1 == null) {
fis1.close();
fis2.close();
throw new IOException("Can't find a matching filename in digest file: " + digestFilename+", missing "+filename1);
} else if (targetHash2 == null) {
fis1.close();
fis2.close();
throw new IOException("Can't find a matching filename in digest file: " + digestFilename+", missing "+filename2);
}


System.err.println("Expected "+digest+": " + targetHash1 + " " + filename1);
System.err.println("Expected "+digest+": " + targetHash2 + " " + filename2);

dis1 = new DigestInputStream(fis1, MessageDigest.getInstance(digest));
dis2 = new DigestInputStream(fis2, MessageDigest.getInstance(digest));

reader1 = Fastq.open(dis1, null, fis1.getChannel(), filename1);
reader2 = Fastq.open(dis2, null, null, filename2);
} else {
reader1 = Fastq.open(fis1, null, fis1.getChannel(), filename1);
reader2 = Fastq.open(dis2, null, null, filename2);
}



// final FastqReader reader1 = Fastq.open(filename1);
// final FastqReader reader2 = Fastq.open(filename2, true);

Iterator<FastqRead> it1 = reader1.iterator();
Iterator<FastqRead> it2 = reader2.iterator();
Expand All @@ -111,13 +208,13 @@ protected long[] execPairedFiles(String filename1, String filename2) throws IOEx


if (out1Filename != null && out2Filename != null) {
if (out1Filename.endsWith(".gz")) {
if (gzip || out1Filename.endsWith(".gz")) {
out1 = new GZIPOutputStream(new FileOutputStream(out1Filename));
} else {
out1 = new FileOutputStream(out1Filename);
}

if (out2Filename.endsWith(".gz")) {
if (gzip || out2Filename.endsWith(".gz")) {
out2 = new GZIPOutputStream(new FileOutputStream(out2Filename));
} else {
out2 = new FileOutputStream(out2Filename);
Expand Down Expand Up @@ -145,6 +242,14 @@ protected long[] execPairedFiles(String filename1, String filename2) throws IOEx
} else {
reader1.close();
reader2.close();
if (dis1 != null) {
dis1.close();
}
if (dis2 != null) {
dis2.close();
}
fis1.close();
fis2.close();
return new long[]{-1,0};
}
}
Expand All @@ -170,6 +275,31 @@ protected long[] execPairedFiles(String filename1, String filename2) throws IOEx
}
}

// Compare the digests accumulated while reading with the expected values.
if (dis1 != null) {
    // BigInteger discards leading zero bytes, so render the hash with an
    // explicit zero-padded width (two hex characters per digest byte);
    // otherwise a digest starting with 0x00 could never match.
    byte[] raw1 = dis1.getMessageDigest().digest();
    String result1 = String.format("%0" + (raw1.length * 2) + "x", new BigInteger(1, raw1));
    byte[] raw2 = dis2.getMessageDigest().digest();
    String result2 = String.format("%0" + (raw2.length * 2) + "x", new BigInteger(1, raw2));

    boolean ok1 = result1.equals(targetHash1);
    boolean ok2 = result2.equals(targetHash2);

    if (!ok1) {
        System.err.println("Error in calculating "+digest+" hash! Expected: "+targetHash1+", got: "+result1);
    }
    if (!ok2) {
        System.err.println("Error in calculating "+digest+" hash! Expected: "+targetHash2+", got: "+result2);
    }

    if (!ok1 || !ok2) {
        // -1 in the first slot reports the failure to the caller
        return new long[]{-1, errorCount};
    }
}

return new long[] {count, errorCount};
}

Expand Down Expand Up @@ -203,14 +333,48 @@ protected boolean checkSingle(FastqRead one) {
return isGood;
}

protected long[] execSingleFile(String filename) throws IOException {
protected long[] execSingleFile(String filename) throws IOException, NoSuchAlgorithmException {
System.err.println("Reading file: "+filename);
FastqReader reader = Fastq.open(filename);

FileInputStream fis = new FileInputStream(filename);
DigestInputStream dis = null;
String targetHash = null;

FastqReader reader;
if (digestFilename != null) {

StringLineReader byline = new StringLineReader(digestFilename);
for (String line: byline) {
String[] spl = line.split(" +");
if (spl[1].charAt(0)=='*') {
spl[1] = spl[1].substring(1);
}
// in the first pass, match by an equal filename
if (filename.equals(spl[1])) {
targetHash = spl[0].toLowerCase();
}
// in the second pass, match by just the filename
else if (new File(spl[1]).getName().equals(new File(filename).getName())) {
targetHash = spl[0].toLowerCase();
}
}
if (targetHash == null) {
fis.close();
throw new IOException("Can't find a matching filename in "+digest+" file: " + digestFilename);
}


System.err.println("Expected " + digest+": " + targetHash);
dis = new DigestInputStream(fis, MessageDigest.getInstance(digest));
reader = Fastq.open(dis, null, fis.getChannel(), filename);
} else {
reader = Fastq.open(fis, null, fis.getChannel(), filename);
}

OutputStream out1 = null;

if (out1Filename != null) {
if (out1Filename.endsWith(".gz")) {
if (gzip || out1Filename.endsWith(".gz")) {
out1 = new GZIPOutputStream(new FileOutputStream(out1Filename));
} else {
out1 = new FileOutputStream(out1Filename);
Expand Down Expand Up @@ -347,6 +511,20 @@ protected long[] execSingleFile(String filename) throws IOException {
}

reader.close();

// Compare the digest accumulated while reading with the expected value.
if (dis != null) {
    // BigInteger discards leading zero bytes, so render the hash with an
    // explicit zero-padded width (two hex characters per digest byte);
    // otherwise a digest starting with 0x00 could never match.
    byte[] raw = dis.getMessageDigest().digest();
    String result = String.format("%0" + (raw.length * 2) + "x", new BigInteger(1, raw));
    if (!result.equals(targetHash)) {
        System.err.println("Error in calculating "+digest+" hash! Expected: "+targetHash+", got: "+result);
        // -1 in the first slot reports the failure to the caller
        return new long[]{-1, errorCount};
    }
}

return new long[]{count, errorCount};
}
}

0 comments on commit 05aa11e

Please sign in to comment.