Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stdout changes #22

Merged
merged 1 commit into from
Apr 10, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@
*.so
mergeReads
nxtrim
test

version.h
6 changes: 5 additions & 1 deletion Changelog
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
2016.04.10
*added --stdout-mp and --stdout-un functionality
*changed the --norc flag to --rf

2016.03.27
*fixed a bug that occurs when comment field is empty issue #17

2016.02.27
*fixed issue #15
*removed BOOST dependencies
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ debug: all

GIT_HASH := $(shell git describe --abbrev=4 --always )

VERSION = v0.4.0
VERSION = v0.4.1
GIT_VERSION =
ifneq "$(wildcard .git)" ""
GIT_VERSION = -$(shell git describe --always)
Expand Down
20 changes: 13 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,14 @@ make
```

####Usage
Trim the data:
```
nxtrim -1 sample_R1.fastq.gz -2 sample_R2.fastq.gz -O sample
```

Assemble with [Velvet](https://www.ebi.ac.uk/~zerbino/velvet/):
Trimming and assembly with [Velvet](https://www.ebi.ac.uk/~zerbino/velvet/):
```
nxtrim -1 sample_R1.fastq.gz -2 sample_R2.fastq.gz -O sample
velveth output_dir 55 -short -fastq.gz sample.se.fastq.gz -shortPaired2 -fastq.gz sample.pe.fastq.gz -shortPaired3 -fastq.gz sample.mp.fastq.gz -shortPaired4 -fastq.gz sample.unknown.fastq.gz
velvetg output_dir -exp_cov auto -cov_cutoff auto -shortMatePaired4 yes
```
the above approach corresponds to the results in the NxTrim publication.

Trimming and assembly with [SPAdes](http://bioinf.spbau.ru/spades):
```
Expand All @@ -36,7 +34,15 @@ spades.py -t 4 --hqmp1-12 sample.allmp.fastq.gz -o output_dir
```
We concatenate the unknown/mp libraries for SPAdes. SPAdes versions>3.1.0 seems to perform better without our virtual single/pe libraries hence we trim with the `--justmp` flag.

**Note:** We achieved good results using the above commands on the bacterial samples analysed in the NxTrim paper. These had moderate coverage (<50X). If you have very high coverage samples, it might be preferable to not use the "unknown" library at all or just treat it as a single-ended library, this will remove the risk of PE contaminants causing problems.
**Note:** We achieved good results using the above commands on the bacterial samples analysed in the NxTrim paper. These had modest coverage (<50X). If you have very high coverage samples, it might be preferable to not use the "unknown" library at all or just treat it as a single-ended library, this will remove the risk of PE contaminants causing problems.

Piping trimmed reads directly to an aligner:
```
nxtrim --stdout -1 EcMG1_ATGTCA_L001_R1_001.fastq.gz -2 EcMG1_ATGTCA_L001_R2_001.fastq.gz | bwa mem EcMG.fna -p - > out.sam
or
nxtrim --stdout-mp -1 EcMG1_ATGTCA_L001_R1_001.fastq.gz -2 EcMG1_ATGTCA_L001_R2_001.fastq.gz | bwa mem EcMG.fna -p - > out.sam
```
The first command pipes both unknown/MP reads to stdout, this is useful if you have a high quality reference to align to. The second only prints *known* MP reads, which is useful for scaffolding purposes.

####Output:

Expand All @@ -49,7 +55,7 @@ The default behaviour expects raw fastq files from a Nextera Mate-Pair library k

####Options:

The trimmer will reverse-complement the reads such that the resulting libraries will be in Forward-Reverse orientation, this reverse-complementing can be disabled via the --norc flag.
The trimmer will reverse-complement the reads such that the resulting libraries will be in Forward-Reverse (FR) orientation, if you wish to keep your reads as Reverse-Forward then use --rf flag.

If you wish to generate pure mate-pair libraries (say for scaffolding), you can use the --justmp flag. This will only generate the unknown and mp libraries. Reads with an adapter occurring < minlength bp before the start will be completely N masked.

Expand Down
21 changes: 12 additions & 9 deletions fastqlib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,15 +187,18 @@ int fastqWriter::write(fqread & read) {
read.print();
}
else{
if(gzwrite(fp,"@",1)==0) die("problem writing output");
if(gzwrite(fp,(char *)read.h.c_str(),read.h.size())==0) die("problem writing output");
if( gzwrite(fp,"\n",1)==0) die("problem writing output");
if( gzwrite(fp,(char *)read.s.c_str(),read.s.size())==0) die("problem writing output");
if( gzwrite(fp,"\n",1)==0) die("problem writing output");
if( gzwrite(fp,(char *)read.l3.c_str(),read.l3.size())==0) die("problem writing output");
if( gzwrite(fp,"\n",1)==0) die("problem writing output");
if( gzwrite(fp,(char *)read.q.c_str(),read.q.size())==0) die("problem writing output");
if( gzwrite(fp,"\n",1)==0) die("problem writing output");
bool write_ok=true;
write_ok = write_ok && gzwrite(fp,"@",1)!=0;
write_ok = write_ok && gzwrite(fp,(char *)read.h.c_str(),read.h.size())!=0;
write_ok = write_ok && gzwrite(fp,"\n",1)!=0;
write_ok = write_ok && gzwrite(fp,(char *)read.s.c_str(),read.s.size())!=0;
write_ok = write_ok && gzwrite(fp,"\n",1)!=0;
write_ok = write_ok && gzwrite(fp,(char *)read.l3.c_str(),read.l3.size())!=0;
write_ok = write_ok && gzwrite(fp,"\n",1)!=0;
write_ok = write_ok && gzwrite(fp,(char *)read.q.c_str(),read.q.size())!=0;
write_ok = write_ok && gzwrite(fp,"\n",1)!=0;
if(!write_ok)
die("problem writing output");
}
return(1);
}
Expand Down
64 changes: 32 additions & 32 deletions matepair.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ int matePair::resolve_overhang(fqread & r1, fqread & r2,int a,int b) {
}
if(a<minlen) {//preceeding dna is too small.
//TODO:could possibly merge for a big read here
if(justmp) {
if(_justmp) {
mp.r1 = r1.mask();
mp.r2 = r2;
}
Expand Down Expand Up @@ -293,7 +293,7 @@ bool matePair::trimExternal(readPair& rp) {

if((a>0 && a<rp.r1.l)||(b>0 && b<rp.r2.l)) {
found = true;
if(justmp) {
if(_justmp) {
mp.r1 = rp.r1;
mp.r2 = rp.r2;
if(a<rp.r1.l)
Expand Down Expand Up @@ -363,7 +363,7 @@ int checkRight(string & s1,string & adapter,int offset,int minoverlap,float simi
int matePair::build(readPair& readpair,int minovl,float sim,int ml,bool jr,bool uh,bool pmp,bool jmp) {
// assert(readpair.r1.l==readpair.r2.l);
clear();
justmp=jmp;
_justmp=jmp;
preserve_mp=pmp;
minoverlap=minovl;
similarity=sim;
Expand Down Expand Up @@ -444,22 +444,22 @@ int matePair::build(readPair& readpair,int minovl,float sim,int ml,bool jr,bool
return(0);
}
else if(a1<(L1-minoverlap) && a2<minlen) {//r2 redundant
if(justmp)
if(_justmp)
mp=readPair(readpair.r1.window(0,a1),readpair.r2.mask()) ;
else
se = readpair.r1.window(0,a1);
if(DEBUG>1) cerr << "CASE B"<<endl;
}
else if(a2<(L2-minoverlap) && a1<minlen) {//r1 redundant
if(justmp)
if(_justmp)
mp=readPair(readpair.r1.mask(),readpair.r2.window(0,a2));
else
se = readpair.r2.window(0,a2);
if(DEBUG>1) cerr << "CASE C"<<endl;
}
else if(a1>=(L1-minoverlap) && a2<minlen) {//obvious PE
if(a1>=minlen && (L2-b2)>=minlen) {
if(justmp) {
if(_justmp) {
mp=readPair(readpair.r1.window(0,a1),readpair.r2.mask()) ;
}
else {
Expand All @@ -471,7 +471,7 @@ int matePair::build(readPair& readpair,int minovl,float sim,int ml,bool jr,bool
}
else if(a2>=(L2-minoverlap) && a1<minlen) {//obvious PE
if(a2>=minlen && (L1-b1)>=minlen) {
if(justmp){
if(_justmp){
mp=readPair(readpair.r1.mask(),readpair.r2.window(0,a2));
}
else{
Expand Down Expand Up @@ -517,58 +517,58 @@ nxtrimWriter::nxtrimWriter(string prefix,bool jmp,bool separate) {

int nxtrimWriter::open(string prefix,bool jmp,bool separate) {
if(prefix=="-")
return(open(jmp,separate));
die("bad output file name: "+prefix);
n_mp=0;
n_pe=0;
n_se=0;
n_unk=0;
justmp = jmp;

if(separate)
_justmp = jmp;
_write_un=_write_mp=_write_se=_write_pe=true;
if(separate) {
mp_out.open(prefix+"_R1.mp.fastq.gz", prefix+"_R2.mp.fastq.gz");
else
mp_out.open(prefix+".mp.fastq.gz");

if(separate)
unknown_out.open(prefix+"_R1.unknown.fastq.gz",prefix+"_R2.unknown.fastq.gz");
else
}
else {
mp_out.open(prefix+".mp.fastq.gz");
unknown_out.open(prefix+".unknown.fastq.gz");
}

if(!justmp) {
if(!_justmp) {
if(separate) pe_out.open(prefix+"_R1.pe.fastq.gz",prefix+"_R2.pe.fastq.gz");
else pe_out.open(prefix+".pe.fastq.gz");
se_out.open(prefix+".se.fastq.gz");
}
else {
_write_se=_write_pe=false;
}
return(0);
}

nxtrimWriter::nxtrimWriter(bool jmp,bool separate) {
open(jmp,separate);
}

nxtrimWriter::nxtrimWriter() {}

int nxtrimWriter::open(bool jmp,bool separate) {
int nxtrimWriter::open() {
n_mp=0;
n_pe=0;
n_se=0;
n_unk=0;
justmp = jmp;
_justmp = true;
_write_un=_write_mp=true;
_write_se=_write_pe=false;
mp_out.open("-");
pe_out.open("-");
unknown_out.open("-");
se_out.open("-");
return(0);
}

int nxtrimWriter::write(matePair m) {
n_mp+=mp_out.write(m.mp);
n_unk+=unknown_out.write(m.unknown);
if(!justmp) {
n_pe+=pe_out.write(m.pe);
n_se+=se_out.write(m.se);
}
if(DEBUG>0) cerr << "Wrote: n_mp="<<n_mp<<" n_unk="<<n_unk<<" n_pe="<<n_pe<<" n_se="<<n_se<<endl;

if(_write_mp) n_mp+=mp_out.write(m.mp);
if(_write_un) n_unk+=unknown_out.write(m.unknown);
if(_write_pe) n_pe+=pe_out.write(m.pe);
if(_write_se) n_se+=se_out.write(m.se);


if(DEBUG>0)
cerr << "Wrote: n_mp="<<n_mp<<" n_unk="<<n_unk<<" n_pe="<<n_pe<<" n_se="<<n_se<<endl;

return(0);
}
13 changes: 6 additions & 7 deletions matepair.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class matePair {
int trimUnknown();
bool trimExternal(readPair & rp);
int joinReads(fqread & r1,fqread & r2,fqread & output);
bool joinreads,use_hamming,preserve_mp,justmp;
bool joinreads,use_hamming,preserve_mp,_justmp;
fqread se;
readPair mp,pe,unknown;
int resolve_overhang(fqread & r1, fqread & r2,int a,int b);
Expand All @@ -43,16 +43,15 @@ class nxtrimWriter {

public:
nxtrimWriter();
nxtrimWriter(string prefix,bool jmp,bool separate_read_files);
nxtrimWriter(bool jmp,bool separate);
nxtrimWriter(string prefix,bool jmp,bool separate_read_files=false);
int open(string prefix,bool jmp,bool separate_read_files);
int open(bool jmp,bool separate);
int open();
bool setMP(bool val) {_write_mp=val;return(_write_mp);}
bool setUN(bool val) {_write_un=val;return(_write_un);}

int write(matePair m);
int weird,n_mp,n_unk,n_se,n_pe;//counts for each virtual library
bool justmp;

private:
bool _justmp,_write_mp,_write_se,_write_pe,_write_un;
bool print_to_stdout;
pairWriter mp_out;
pairWriter pe_out;
Expand Down
42 changes: 27 additions & 15 deletions nxtrim.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ void usage() {
cerr << " -2 [ --r2 ] arg read 2 in fastq format (gzip allowed)"<<endl;
cerr << "Allowed options:"<<endl;
cerr << " -O [ --output-prefix ] arg output prefix"<<endl;
cerr << " --stdout print trimmed reads to stdout"<<endl;
cerr << " --justmp just creates a the mp/unknown libraries (reads with adapter at the start will be completely N masked)"<<endl;
cerr << " --stdout print trimmed reads to stdout (equivalent to justmp)"<<endl;
cerr << " --stdout-mp print only known MP reads to stdout (good for scaffolding)"<<endl;
cerr << " --stdout-un print only unknown reads to stdout"<<endl;
cerr << " --joinreads try to merge overhangs from R2 with R1 (default: no joining)"<<endl;
cerr << " --norc do NOT reverse-complement mate-pair reads (use this if your reads are already in FR orientation)"<<endl;
cerr << " --rf leave reads in RF orientation (or use this if your reads are already in FR orientation)"<<endl;
cerr << " --preserve-mp preserve MPs even when the corresponding PE has longer reads"<<endl;
cerr << " --ignorePF ignore chastity/purity filters in read headers"<<endl;
// cerr << " --mp just creates the mp library. Useful if you suspect a high level of contamination" <<endl;
// cerr << " --unknown just creates the unknown library"<<endl;
cerr << " --separate output paired reads in separate files (prefix_R1/prefix_r2). Default is interleaved."<<endl;
cerr << " -s, --similarity arg (=0.85) The minimum similarity between strings to be considered a match. Where hamming_distance <= ceiling( (1-similarity) * string_length )"<<endl;
cerr << " -v, --minoverlap arg (=12) The minimum overlap to be considered for matching"<<endl;
Expand All @@ -46,6 +46,8 @@ void usage() {
#define MP 6
#define UNKNOWN 7
#define SEPARATE 8
#define STDOUT_MP 9
#define STDOUT_UN 10

int main(int argc,char **argv) {
int c;
Expand All @@ -64,16 +66,20 @@ int main(int argc,char **argv) {
bool rc = true;
bool ignorePF = false;
bool write_stdout=false;
bool write_stdout_mp=false;
bool write_stdout_un=false;
bool hamming = true;//always use hamming.
bool separate=false;
static struct option loptions[] = {
{"r1",1,0,'1'},
{"r2",1,0,'2'},
{"output-prefix",1,0,'O'},
{"stdout",0,0,STDOUT},
{"stdout-mp",0,0,STDOUT_MP},
{"stdout-un",0,0,STDOUT_UN},
{"justmp",0,0,JUSTMP},
{"joinreads",0,0,JOINREADS},
{"norc",0,0,NORC},
{"rf",0,0,NORC},
{"preserve-mp",0,0,PMP},
{"ignorePF",0,0,IGNOREPF},
{"mp",0,0,MP},
Expand All @@ -94,6 +100,8 @@ int main(int argc,char **argv) {
case 'v': minoverlap = atoi(optarg); break;
case 'l': minlen = atoi(optarg); break;
case STDOUT: write_stdout=true; break;
case STDOUT_MP: write_stdout_mp=true; break;
case STDOUT_UN: write_stdout_un=true; break;
case JUSTMP:justmp=true; break;
case JOINREADS:joinreads=true; break;
case NORC:rc=false; break;
Expand All @@ -107,16 +115,17 @@ int main(int argc,char **argv) {
die("both --r1 and --r2 must be speicified");
if(write_stdout && !prefix.empty())
die("--stdout and -O are incompatible");
if(!write_stdout && prefix.empty() )
die("one of--stdout and -O must be specified");
if(!write_stdout && !write_stdout_mp && !write_stdout_un && prefix.empty() )
die("one of --stdout / --stdout-mp / --stdout-un / -O must be specified");
if(preserve_mp&&justmp)
die("the --preserve_mp and --justmp flags are incompatible!");

if(write_stdout)
if(justmp)
cerr << "Writing to stdout"<<endl;
else
die("--stdout can only be used with the --justmp flag");
if( (write_stdout+write_stdout_mp+write_stdout_un)>1)
die("only one of --stdout / --stdout-mp / --stdout-un may be specified!");

if(write_stdout||write_stdout_mp||write_stdout_un) {
cerr << "Writing to stdout"<<endl;
justmp=true;
}
else
cerr << "Output: " << prefix <<".*.fastq.gz"<<endl;

Expand All @@ -138,8 +147,11 @@ int main(int argc,char **argv) {
bool trim_warn=true;

nxtrimWriter out;
if(write_stdout)
out.open("-",justmp,separate);
if(write_stdout||write_stdout_un||write_stdout_mp) {
out.open();
if(write_stdout_un) out.setMP(false);
if(write_stdout_mp) out.setUN(false);
}
else
out.open(prefix,justmp,separate);

Expand Down
Loading