From a3bf920e939981ce57968cf8bf2024555ffa2f03 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Sun, 10 Apr 2016 17:27:48 +0100 Subject: [PATCH] stdout changes --- .gitignore | 1 - Changelog | 6 +++- Makefile | 2 +- README.md | 20 +++++++----- fastqlib.cpp | 21 +++++++------ matepair.cpp | 64 ++++++++++++++++++------------------- matepair.h | 13 ++++---- nxtrim.cpp | 42 ++++++++++++++++--------- test/alignment_summary.py | 66 +++++++++++++++++++++++++++++++++++++++ test/test.sh | 37 ++++++++++++++++++++++ 10 files changed, 199 insertions(+), 73 deletions(-) create mode 100644 test/alignment_summary.py create mode 100644 test/test.sh diff --git a/.gitignore b/.gitignore index 4e80f97..dbd9ff0 100755 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,5 @@ *.so mergeReads nxtrim -test version.h diff --git a/Changelog b/Changelog index d1fe2fd..0f22530 100755 --- a/Changelog +++ b/Changelog @@ -1,6 +1,10 @@ +2016.04.10 +*added --stdout-mp and --stdout-un functionality +*changed the --norc flag to --rf + 2016.03.27 *fixed a bug that occurs when comment field is empty issue #17 - + 2016.02.27 *fixed issue #15 *removed BOOST dependencies diff --git a/Makefile b/Makefile index a57df22..33829a7 100755 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ debug: all GIT_HASH := $(shell git describe --abbrev=4 --always ) -VERSION = v0.4.0 +VERSION = v0.4.1 GIT_VERSION = ifneq "$(wildcard .git)" "" GIT_VERSION = -$(shell git describe --always) diff --git a/README.md b/README.md index eb2be12..f6c7cc2 100755 --- a/README.md +++ b/README.md @@ -17,16 +17,14 @@ make ``` ####Usage -Trim the data: -``` -nxtrim -1 sample_R1.fastq.gz -2 sample_R2.fastq.gz -O sample -``` -Assemble with [Velvet](https://www.ebi.ac.uk/~zerbino/velvet/): +Trimming and assembly with [Velvet](https://www.ebi.ac.uk/~zerbino/velvet/): ``` +nxtrim -1 sample_R1.fastq.gz -2 sample_R2.fastq.gz -O sample velveth output_dir 55 -short -fastq.gz sample.se.fastq.gz -shortPaired2 -fastq.gz sample.pe.fastq.gz -shortPaired3 -fastq.gz sample.mp.fastq.gz -shortPaired4 -fastq.gz sample.unknown.fastq.gz velvetg output_dir -exp_cov auto -cov_cutoff auto -shortMatePaired4 yes ``` +the above approach corresponds to the results in the NxTrim publication. Trimming and assembly with [SPAdes](http://bioinf.spbau.ru/spades): ``` @@ -36,7 +34,15 @@ spades.py -t 4 --hqmp1-12 sample.allmp.fastq.gz -o output_dir ``` We concatenate the unknown/mp libraries for SPAdes. SPAdes versions>3.1.0 seems to perform better without our virtual single/pe libraries hence we trim with the `--justmp` flag. -**Note:** We achieved good results using the above commands on the bacterial samples analysed in the NxTrim paper. These had moderate coverage (<50X). If you have very high coverage samples, it might be preferable to not use the "unknown" library at all or just treat it as a single-ended library, this will remove the risk of PE contaminants causing problems. +**Note:** We achieved good results using the above commands on the bacterial samples analysed in the NxTrim paper. These had modest coverage (<50X). If you have very high coverage samples, it might be preferable to not use the "unknown" library at all or just treat it as a single-ended library, this will remove the risk of PE contaminants causing problems. + +Piping trimmed reads directly to an aligner: +``` +nxtrim --stdout -1 EcMG1_ATGTCA_L001_R1_001.fastq.gz -2 EcMG1_ATGTCA_L001_R2_001.fastq.gz | bwa mem EcMG.fna -p - > out.sam +or +nxtrim --stdout-mp -1 EcMG1_ATGTCA_L001_R1_001.fastq.gz -2 EcMG1_ATGTCA_L001_R2_001.fastq.gz | bwa mem EcMG.fna -p - > out.sam +``` +The first command pipes both unknown/MP reads to stdout, this is useful if you have a high quality reference to align to. The second only prints *known* MP reads, which is useful for scaffolding purposes. ####Output: @@ -49,7 +55,7 @@ The default behaviour expects raw fastq files from a Nextera Mate-Pair library k ####Options: -The trimmer will reverse-complement the reads such that the resulting libraries will be in Forward-Reverse orientation, this reverse-complementing can be disabled via the --norc flag. +The trimmer will reverse-complement the reads such that the resulting libraries will be in Forward-Reverse (FR) orientation, if you wish to keep your reads as Reverse-Forward then use --rf flag. If you wish to generate pure mate-pair libraries (say for scaffolding), you can use the --justmp flag. This will only generate the unknown and mp libraries. Reads with an adapter occurring < minlength bp before the start will be completely N masked. diff --git a/fastqlib.cpp b/fastqlib.cpp index 023b9f5..bc23a10 100755 --- a/fastqlib.cpp +++ b/fastqlib.cpp @@ -187,15 +187,18 @@ int fastqWriter::write(fqread & read) { read.print(); } else{ - if(gzwrite(fp,"@",1)==0) die("problem writing output"); - if(gzwrite(fp,(char *)read.h.c_str(),read.h.size())==0) die("problem writing output"); - if( gzwrite(fp,"\n",1)==0) die("problem writing output"); - if( gzwrite(fp,(char *)read.s.c_str(),read.s.size())==0) die("problem writing output"); - if( gzwrite(fp,"\n",1)==0) die("problem writing output"); - if( gzwrite(fp,(char *)read.l3.c_str(),read.l3.size())==0) die("problem writing output"); - if( gzwrite(fp,"\n",1)==0) die("problem writing output"); - if( gzwrite(fp,(char *)read.q.c_str(),read.q.size())==0) die("problem writing output"); - if( gzwrite(fp,"\n",1)==0) die("problem writing output"); + bool write_ok=true; + write_ok = write_ok && gzwrite(fp,"@",1)!=0; + write_ok = write_ok && gzwrite(fp,(char *)read.h.c_str(),read.h.size())!=0; + write_ok = write_ok && gzwrite(fp,"\n",1)!=0; + write_ok = write_ok && gzwrite(fp,(char *)read.s.c_str(),read.s.size())!=0; + write_ok = write_ok && gzwrite(fp,"\n",1)!=0; + write_ok = write_ok && gzwrite(fp,(char *)read.l3.c_str(),read.l3.size())!=0; + write_ok = write_ok && gzwrite(fp,"\n",1)!=0; + write_ok = write_ok && gzwrite(fp,(char *)read.q.c_str(),read.q.size())!=0; + write_ok = write_ok && gzwrite(fp,"\n",1)!=0; + if(!write_ok) + die("problem writing output"); } return(1); } diff --git a/matepair.cpp b/matepair.cpp index 3c7a3c0..d882623 100755 --- a/matepair.cpp +++ b/matepair.cpp @@ -166,7 +166,7 @@ int matePair::resolve_overhang(fqread & r1, fqread & r2,int a,int b) { } if(a0 && a0 && b1) cerr << "CASE B"<=(L1-minoverlap) && a2=minlen && (L2-b2)>=minlen) { - if(justmp) { + if(_justmp) { mp=readPair(readpair.r1.window(0,a1),readpair.r2.mask()) ; } else { @@ -471,7 +471,7 @@ int matePair::build(readPair& readpair,int minovl,float sim,int ml,bool jr,bool } else if(a2>=(L2-minoverlap) && a1=minlen && (L1-b1)>=minlen) { - if(justmp){ + if(_justmp){ mp=readPair(readpair.r1.mask(),readpair.r2.window(0,a2)); } else{ @@ -517,58 +517,58 @@ nxtrimWriter::nxtrimWriter(string prefix,bool jmp,bool separate) { int nxtrimWriter::open(string prefix,bool jmp,bool separate) { if(prefix=="-") - return(open(jmp,separate)); + die("bad output file name: "+prefix); n_mp=0; n_pe=0; n_se=0; n_unk=0; - justmp = jmp; - - if(separate) + _justmp = jmp; + _write_un=_write_mp=_write_se=_write_pe=true; + if(separate) { mp_out.open(prefix+"_R1.mp.fastq.gz", prefix+"_R2.mp.fastq.gz"); - else - mp_out.open(prefix+".mp.fastq.gz"); - - if(separate) unknown_out.open(prefix+"_R1.unknown.fastq.gz",prefix+"_R2.unknown.fastq.gz"); - else + } + else { + mp_out.open(prefix+".mp.fastq.gz"); unknown_out.open(prefix+".unknown.fastq.gz"); + } - if(!justmp) { + if(!_justmp) { if(separate) pe_out.open(prefix+"_R1.pe.fastq.gz",prefix+"_R2.pe.fastq.gz"); else pe_out.open(prefix+".pe.fastq.gz"); se_out.open(prefix+".se.fastq.gz"); + } + else { + _write_se=_write_pe=false; } return(0); } -nxtrimWriter::nxtrimWriter(bool jmp,bool separate) { - open(jmp,separate); -} - nxtrimWriter::nxtrimWriter() {} -int nxtrimWriter::open(bool jmp,bool separate) { +int nxtrimWriter::open() { n_mp=0; n_pe=0; n_se=0; n_unk=0; - justmp = jmp; + _justmp = true; + _write_un=_write_mp=true; + _write_se=_write_pe=false; mp_out.open("-"); - pe_out.open("-"); unknown_out.open("-"); - se_out.open("-"); return(0); } int nxtrimWriter::write(matePair m) { - n_mp+=mp_out.write(m.mp); - n_unk+=unknown_out.write(m.unknown); - if(!justmp) { - n_pe+=pe_out.write(m.pe); - n_se+=se_out.write(m.se); - } - if(DEBUG>0) cerr << "Wrote: n_mp="<1) + die("only one of --stdout / --stdout-mp / --stdout-un may be specified!"); + + if(write_stdout||write_stdout_mp||write_stdout_un) { + cerr << "Writing to stdout"<thr and read.pos < (sam.lengths[read.tid] - thr) and read.pnext>thr and read.pnext< (sam.lengths[read.tid] - thr) + insert_size = max(read.pos+L,read.pnext+L) - min(read.pos,read.pnext) + not_too_small = insert_size > L + not_identical = read.pos!=read.pnext + + if not read.is_unmapped and not read.mate_is_unmapped and not_identical: + if not_on_end and not_too_small: + if read.pos < read.pnext: + if (read.flag & 0x10): + k1 = 'R' + else: + k1 = 'F' + + if (read.flag & 0x20): + k2='R' + else: + k2='F' + o=k1+k2 + d[k1+k2] += 1 + isizes[k1+k2].append(insert_size) +# sys.stderr.write("%s %d\n"%(k1+k2,insert_size)) + else: + if (read.flag & 0x10): + k1 = 'R' + else: + k1 = 'F' + + if (read.flag & 0x20): + k2='R' + else: + k2='F' + o=k2+k1 + if(args.output!=None): + if o=='RF': + out_rf.write((read)) + if o=='FR': + out_fr.write((read)) + + print "Orientation","Frequency".rjust(10," "),"Median insert size".rjust(20," ") + for k in ['FR','RF','FF','RR']: + x=isizes[k] + x.sort() + print k.ljust(9," "),("%d"%d[k]).rjust(12," "),("%d"%x[len(x)/2]).rjust(20," ") + diff --git a/test/test.sh b/test/test.sh new file mode 100644 index 0000000..e4a4774 --- /dev/null +++ b/test/test.sh @@ -0,0 +1,37 @@ +#get data +r1=EcMG1_ATGTCA_L001_R1_001.fastq.gz +r2=EcMG1_ATGTCA_L001_R2_001.fastq.gz +ref=EcMG.fna +if [ ! -f $ref ] +then + wget https://s3-eu-west-1.amazonaws.com/nxtrim-examples/bacteria/${ref} +fi +if [ ! -f $r1 ] +then + wget https://s3-eu-west-1.amazonaws.com/nxtrim-examples/bacteria/${r1} +fi +if [ ! -f $r2 ] +then + wget https://s3-eu-west-1.amazonaws.com/nxtrim-examples/bacteria/${r2} +fi +bwa index $ref + +#test stdout and alignment +out=EcMG.bam +../nxtrim --stdout -1 $r1 -2 $r2 | bwa mem $ref -p - | samtools view -Ob -o $out + +out=EcMG.rf.bam +../nxtrim --rf --stdout -1 $r1 -2 $r2 | bwa mem $ref -p - | samtools view -Ob -o $out + +out=EcMG.mp.bam +../nxtrim --stdout-mp -1 $r1 -2 $r2 | bwa mem $ref -p - | samtools view -Ob -o $out + +out=EcMG.un.bam +../nxtrim --stdout-un -1 $r1 -2 $r2 | bwa mem $ref -p - | samtools view -Ob -o $out + + +##assemble with velvet +../nxtrim -1 $r1 -2 $r2 -O EcMG +velveth output_dir 61 -short -fastq.gz EcMG.se.fastq.gz -shortPaired2 -fastq.gz EcMG.pe.fastq.gz -shortPaired3 -fastq.gz EcMG.mp.fastq.gz -shortPaired4 -fastq.gz EcMG.unknown.fastq.gz +velvetg output_dir -exp_cov auto -cov_cutoff auto -shortMatePaired4 yes +