diff --git a/.dockerignore b/.dockerignore index 0db1033..6236564 100644 --- a/.dockerignore +++ b/.dockerignore @@ -9,3 +9,4 @@ /docs.tar.gz /setup.sh /prerelease.sh +/blib diff --git a/.gitignore b/.gitignore index da0f63c..2a7d497 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,8 @@ examples/cgp_gnos_pull.ini /c/c_tests/tests_log /bin/diff_bams /bin/reheadSQ +/bin/mismatchQc +/bin/mmFlagModifier /c/c_tests/01_bam_stats_output_tests /c/c_tests/02_bam_access_tests /c/c_tests/03_bam_stats_calcs_tests diff --git a/CHANGES.md b/CHANGES.md index 1217470..5b61995 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,14 @@ # CHANGES +## 5.1.0 + +* Base image updated to Focal (Ubuntu 20.04). +* Majority of biobambam2 replaced with samtools functions. +* Reads undergo full collate when mapping from BAM/CRAM (bwa-mem2 prep). +* Duplicate marking `samtools markdup --mode` options exposed to `bwa_mem.pl`. + * Lanes mapped with earlier versions of PCAP-core cannot be merged without reporocessing to add "mate score tag" via `samtools fixmate`. +* Scramble option for `bwa_mem.pl` deprecated, relevant option for fast CRAM random access exposed. + ## 5.0.5 * Add `noindex` commandline flag to `merge_or_mark.pl` for bammerge calls. Only permitted alongisde `qnamesort` diff --git a/Dockerfile b/Dockerfile index 68e5e0d..c112cfa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,38 +1,42 @@ -FROM quay.io/wtsicgp/cgpbigwig:1.1.0 as builder +FROM quay.io/wtsicgp/cgpbigwig:1.3.0 as builder USER root -ARG BBB2_URL="https://gitlab.com/german.tischler/biobambam2/uploads/178774a8ece96d2201fcd0b5249884c7/biobambam2-2.0.146-release-20191030105216-x86_64-linux-gnu.tar.xz" +# ALL tool versions used by opt-build.sh +# need to keep in sync with setup.sh + +# newer gitlab versions do not work +ARG BBB2_URL="https://github.com/gt1/biobambam2/releases/download/2.0.87-release-20180301132713/biobambam2-2.0.87-release-20180301132713-x86_64-etch-linux-gnu.tar.gz" ARG BWAMEM2_URL="https://github.com/bwa-mem2/bwa-mem2/releases/download/v2.0pre2/bwa-mem2-2.0pre2_x64-linux.tar.bz2" ARG STADEN="https://iweb.dl.sourceforge.net/project/staden/staden/2.0.0b11/staden-2.0.0b11-2016-linux-x86_64.tar.gz" ARG VER_BIODBHTS="3.01" ARG VER_BWA="v0.7.17" -ARG VER_HTSLIB="1.9" -ARG VER_SAMTOOLS="1.9" +ARG VER_HTSLIB="1.10.2" +ARG VER_SAMTOOLS="1.10" RUN apt-get -yq update -RUN apt-get install -yq --no-install-recommends\ - apt-transport-https\ - locales\ - curl\ - ca-certificates\ - libperlio-gzip-perl\ - make\ - bzip2\ - gcc\ - psmisc\ - time\ - zlib1g-dev\ - libbz2-dev\ - liblzma-dev\ - libcurl4-gnutls-dev\ - libncurses5-dev\ - nettle-dev\ - libp11-kit-dev\ - libtasn1-dev\ - libgnutls-dev\ - libgd-dev\ - libdb-dev +RUN apt-get install -yq --no-install-recommends apt-transport-https +RUN apt-get install -yq --no-install-recommends locales +RUN apt-get install -yq --no-install-recommends curl +RUN apt-get install -yq --no-install-recommends ca-certificates +RUN apt-get install -yq --no-install-recommends libperlio-gzip-perl +RUN apt-get install -yq --no-install-recommends make +RUN apt-get install -yq --no-install-recommends bzip2 +RUN apt-get install -yq --no-install-recommends gcc +RUN apt-get install -yq --no-install-recommends psmisc +RUN apt-get install -yq --no-install-recommends time +RUN apt-get install -yq --no-install-recommends zlib1g-dev +RUN apt-get install -yq --no-install-recommends libbz2-dev +RUN apt-get install -yq --no-install-recommends liblzma-dev +RUN apt-get install -yq --no-install-recommends libcurl4-gnutls-dev +RUN apt-get install -yq --no-install-recommends libncurses5-dev +RUN apt-get install -yq --no-install-recommends nettle-dev +RUN apt-get install -yq --no-install-recommends libp11-kit-dev +RUN apt-get install -yq --no-install-recommends libtasn1-dev +RUN apt-get install -yq --no-install-recommends libdb-dev +RUN apt-get install -yq --no-install-recommends libgnutls28-dev +RUN apt-get install -yq --no-install-recommends xz-utils +RUN apt-get install -yq --no-install-recommends libexpat1-dev RUN locale-gen en_US.UTF-8 RUN update-locale LANG=en_US.UTF-8 @@ -54,11 +58,11 @@ RUN bash build/opt-build.sh $OPT COPY . . RUN bash build/opt-build-local.sh $OPT -FROM ubuntu:16.04 +FROM ubuntu:20.04 LABEL maintainer="cgphelp@sanger.ac.uk"\ uk.ac.sanger.cgp="Cancer, Ageing and Somatic Mutation, Wellcome Sanger Institute" \ - version="5.0.5" \ + version="5.1.0" \ description="pcap-core" ENV OPT /opt/wtsi-cgp @@ -67,6 +71,7 @@ ENV PATH $OPT/bin:$PATH ENV PERL5LIB $OPT/lib/perl5 ENV LD_LIBRARY_PATH $OPT/lib:$OPT/scramble/lib ENV LC_ALL C +ENV GPERF_FOR_BWA /usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 RUN apt-get -yq update RUN apt-get install -yq --no-install-recommends \ @@ -82,8 +87,10 @@ zlib1g \ liblzma5 \ libncurses5 \ p11-kit \ -libcurl3 \ +libcurl3-gnutls \ +libcurl4 \ moreutils \ +google-perftools \ unattended-upgrades && \ unattended-upgrade -d -v && \ apt-get remove -yq unattended-upgrades && \ diff --git a/README.md b/README.md index 9f5b418..8773658 100644 --- a/README.md +++ b/README.md @@ -29,18 +29,12 @@ Available programs are described in the [wiki][wiki]. ## Docker, Singularity and Dockstore -There are docker and dockstore.org wrappers for this project at [dockstore-cgpmap][dockstore-cgpmap]. +There are dockstore.org CWL and wrappers for this project at [dockstore-cgpmap][dockstore-cgpmap]. -The docker image is held on [quay.io][quay-io-cgpmap]. - -The CWL bindings of `dockstore-cgpmap` specifically target execution of the BWA mem mapping flow, -however all tools are contained in the image and can be used if you construct the relevant docker -commands. +The docker image is held on [quay.io][quay-io-pcap-core]. The docker image is know to work correctly after import into a singularity image. -See the [dockstore-cgpmap][dockstore-cgpmap] documentation for more detail. - ## Dependencies/Install Please be aware that this expects basic C compilation libraries and tools to be available, most are listed in `INSTALL`. @@ -69,13 +63,10 @@ Please see the respective licence for each before use. ### Cutting the release 1. Update `lib/PCAP.pm` to the correct version. -2. Ensure upgrade path for new version number is added to `lib/PCAP.pm`. +2. Update `Dockerfile` to the correct version. 3. Update `CHANGES.md` to show major items. -4. Run `./prerelease.sh` -5. Check all tests and coverage reports are acceptable. -6. Commit the updated docs tree and updated module/version. -7. Push commits. -8. Use the GitHub tools to draft a release. +4. Push commits and verify with Sanger internal CI. +5. Use the GitHub tools to draft a release. @@ -87,7 +78,7 @@ Please see the respective licence for each before use. [cancerit_github]: https://github.com/cancerit [old_repo]: https://github.com/ICGC-TCGA-PanCancer/PCAP-core [dockstore-cgpmap]: https://github.com/cancerit/dockstore-cgpmap -[quay-io-cgpmap]: https://quay.io/repository/wtsicgp/dockstore-cgpmap +[quay-io-pcap-core]: https://quay.io/repository/wtsicgp/pcap-core [travis-base]: https://travis-ci.org/cancerit/PCAP-core diff --git a/bin/bwa_mem.pl b/bin/bwa_mem.pl index e80f8aa..6f7afe4 100755 --- a/bin/bwa_mem.pl +++ b/bin/bwa_mem.pl @@ -52,15 +52,14 @@ my $options = setup(); my $threads = PCAP::Threaded->new($options->{'threads'}); - &PCAP::Threaded::disable_out_err if(exists $options->{'index'}); # register processes - $threads->add_function('split', \&PCAP::Bwa::split_in); - $threads->add_function('bwamem', \&PCAP::Bwa::bwa_mem, exists $options->{'index'} ? 1 : $options->{'map_threads'}); + $threads->add_function('split', \&PCAP::Bwa::split_in, split_threads($options)); + $threads->add_function('bwamem', \&PCAP::Bwa::bwa_mem, exists $options->{'index'} ? 1 : $options->{'map_threads'}); PCAP::Bwa::mem_setup($options) if(!exists $options->{'process'} || $options->{'process'} eq 'setup'); - $threads->run($options->{'max_split'}, 'split', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'split'); + $threads->run($options->{'max_split'}, 'split', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'split'); if(!exists $options->{'process'} || $options->{'process'} eq 'bwamem') { $options->{'max_index'} = PCAP::Bwa::mem_mapmax($options); @@ -78,6 +77,24 @@ } } +sub split_threads { + my $options = shift; + my $div = 1; + my $threads_per_split = 1; + if(exists $options->{index}) { + $div = 1; + $threads_per_split = $options->{threads}; + } + elsif($options->{raw_files}->[0] =~ m/(bam|cram)$/) { + my $inputs = scalar @{$options->{raw_files}}; + $threads_per_split = int ($options->{threads} / $inputs); + $threads_per_split = 1 if($threads_per_split < 1); + $div = $threads_per_split; + } + $options->{threads_per_split} = $threads_per_split; # so can be used later + return $div; # so can be used as return +} + sub cleanup { my $options = shift; my $tmpdir = $options->{'tmp'}; @@ -91,6 +108,8 @@ sub setup { 'mmqcfrac' => 0.05, 'threads' => 1, 'fragment' => 10, + 'dupmode' => 't', + 'seqslice' => 10000, 'csi' => undef, ); @@ -116,6 +135,8 @@ sub setup { 'q|mmqc' => \$opts{'mmqc'}, 'qf|mmqcfrac:f' => \$opts{'mmqcfrac'}, 'bm2|bwamem2' => \$opts{'bwamem2'}, + 'd|dupmode:s' => \$opts{'dupmode'}, + 'ss|seqslice:i' => $opts{'seqslice'}, ) or pod2usage(2); pod2usage(-verbose => 1, -exitval => 0) if(defined $opts{'h'}); @@ -145,10 +166,14 @@ sub setup { die "ERROR: Please generate $opts{dict}, e.g.\n\t\$ samtools dict -a \$ASSEMBLY -s \$SPECIES $opts{reference} > $opts{dict}\n"; } + if(defined $opts{'scramble'}) { + die "ERROR: -scramble option is deprecated, please see -seqslice\n"; + } + delete $opts{'process'} unless(defined $opts{'process'}); delete $opts{'index'} unless(defined $opts{'index'}); delete $opts{'bwa'} unless(defined $opts{'bwa'}); - delete $opts{'scramble'} unless(defined $opts{'scramble'}); + delete $opts{'scramble'}; delete $opts{'bwa_pl'} unless(defined $opts{'bwa_pl'}); delete $opts{'mmqc'} unless(defined $opts{'mmqc'}); delete $opts{'csi'} unless(defined $opts{'csi'}); @@ -220,11 +245,12 @@ =head1 SYNOPSIS Optional parameters: -bwamem2 -bm2 Use bwa-mem2 instead of bwa. -fragment -f Split input into fragments of X million repairs [10] + - only applies to fastq[.gz] input -nomarkdup -n Don't mark duplicates [flag] -csi Use CSI index instead of BAI for BAM files [flag]. -cram -c Output cram, see '-sc' [flag] - -scramble -sc Single quoted string of parameters to pass to Scramble when '-c' used - - '-I,-O' are used internally and should not be provided + -seqslice -ss seqs_per_slice for CRAM compression [samtools default: 10000] + -scramble -sc DEPRECATED -bwa -b Single quoted string of additional parameters to pass to BWA - '-t,-p,-R' are used internally and should not be provided. - '-v' is set to 1 unless '-bwa' is set. @@ -234,12 +260,15 @@ =head1 SYNOPSIS -mmqc -q Mark reads as QCFAIL (0x200, 512) if mismatch rate exceeded [flag] - Please see 'bwa_mem.pl -m' -mmqcfrac -qf Mismatch fraction for -mmqc [0.05] + -dupmode -d see "samtools markdup -m" [t] Targeted processing: -process -p Only process this step then exit, optionally set -index + setup - checks and configure workspace (-index N/A) + split - split data by readgroup and chunk size (if applicable) bwamem - only applicable if input is bam mark - Run duplicate marking (-index N/A) - stats - Generates the *.bas file for the final BAM. + stats - Generates the *.bas file for the final BAM (-index N/A) -index -i Optionally restrict '-p' to single job bwamem - 1.. @@ -249,6 +278,7 @@ =head1 SYNOPSIS https://github.com/gperftools/ (assuming number of cores not exceeded) If available specify the path to 'gperftools/lib/libtcmalloc_minimal.so'. - NOT APPLIED TO bwa-mem2 + Falls back to environment variable GPERF_FOR_BWA when not set, or nothing. Other: -jobs -j For a parallel step report the number of jobs required diff --git a/bin/merge_or_mark.pl b/bin/merge_or_mark.pl index 11616dd..a87c48f 100755 --- a/bin/merge_or_mark.pl +++ b/bin/merge_or_mark.pl @@ -39,7 +39,6 @@ use PCAP::Bwa; use version; -const my $COORD_SORT_ORDER => 'coordinate'; const my $QUERYNAME_SORT_ORDER => 'queryname'; const my @VALID_PROCESS => qw(setup mark stats); const my %INDEX_FACTOR => ( 'setup' => 1, @@ -72,7 +71,8 @@ sub setup { my %opts = ( 'threads' => 1, 'csi' => undef, - 'sortorder' => $COORD_SORT_ORDER, + 'dupmode' => 't', + 'seqslice' => 10000, ); GetOptions( 'h|help' => \$opts{'h'}, @@ -84,11 +84,13 @@ sub setup { 's|sample=s' => \$opts{'sample'}, 'n|nomarkdup' => \$opts{'nomarkdup'}, 'p|process=s' => \$opts{'process'}, - 'q|querynamesort' => \$opts{'qnamesort'}, + 'q|qnamesort' => \$opts{'qnamesort'}, 'i|noindex' => \$opts{'noindex'}, 'csi' => \$opts{'csi'}, 'c|cram' => \$opts{'cram'}, 'sc|scramble=s' => \$opts{'scramble'}, + 'd|dupmode:s' => \$opts{'dupmode'}, + 'ss|seqslice:i' => $opts{'seqslice'}, ) or pod2usage(2); pod2usage(-verbose => 1, -exitval => 0) if(defined $opts{'h'}); @@ -113,9 +115,13 @@ sub setup { die "ERROR: Please generate $opts{dict}, e.g.\n\t\$ samtools dict -a \$ASSEMBLY -s \$SPECIES $opts{reference} > $opts{dict}\n"; } + if(defined $opts{'scramble'}) { + die "ERROR: -scramble option is deprecated, please see -seqslice\n"; + } + delete $opts{'process'} unless(defined $opts{'process'}); delete $opts{'index'} unless(defined $opts{'index'}); - delete $opts{'scramble'} unless(defined $opts{'scramble'}); + delete $opts{'scramble'}; delete $opts{'csi'} unless(defined $opts{'csi'}); if($opts{'qnamesort'} && !$opts{'nomarkdup'}){ die "ERROR: -qnamesort can only be used in conjunction with -nomarkdups\n"; @@ -123,7 +129,6 @@ sub setup { if($opts{'noindex'} && !$opts{'qnamesort'}){ die "ERROR: -noindex can only be used in conjunction with -qnamesort\n"; } - $opts{'sortorder'} = $QUERYNAME_SORT_ORDER if($opts{'qnamesort'}); if($opts{'threads'} > 4) { warn "Setting 'threads' to 4 as higher values are of limited value\n"; @@ -172,18 +177,19 @@ =head1 SYNOPSIS -nomarkdup -n Don't mark duplicates [flag] -qnamesort -q Use queryname sorting flag in bammerge rather than coordinate. [flag]. To be used in conjunction with -nomarkdup only - -noindex -i Don't attempt to index the merged file. Only available in conjunction with + -noindex -i Don't attempt to index the merged file. Only available in conjunction with -qnamesort. -csi Use CSI index instead of BAI for BAM files [flag]. -cram -c Output cram, see '-sc' [flag] - -scramble -sc Single quoted string of parameters to pass to Scramble when '-c' used - - '-I,-O' are used internally and should not be provided + -seqslice -ss seqs_per_slice for CRAM compression [samtools default: 10000] + -scramble -sc DEPRECATED + -dupmode -d see "samtools markdup -m" [t] Targeted processing: - -process -p Only process this step then exit, optionally set -index - bwamem - only applicable if input is bam - mark - Run duplicate marking (-index N/A) - stats - Generates the *.bas file for the final BAM. + -process -p Only process this step then exit + setup - only applicable if input is bam + mark - Run duplicate marking + stats - Generates the *.bas file for the final BAM Other: -help -h Brief help message. @@ -261,6 +267,8 @@ =head2 OPTIONAL parameters =item B<-scramble> +DEPRECATED - see -seqslice + Single quoted string of parameters to pass to Scramble when '-c' used. Please see the Scramble documentation for details. diff --git a/build/opt-build-local.sh b/build/opt-build-local.sh index 2e0abfa..7d13a48 100644 --- a/build/opt-build-local.sh +++ b/build/opt-build-local.sh @@ -40,9 +40,11 @@ mkdir -p $INST_PATH/bin # make sure tools installed can see the install loc of libraries set +u +BB_INST=$INST_PATH/biobambam2 export LD_LIBRARY_PATH=`echo $INST_PATH/lib:$LD_LIBRARY_PATH | perl -pe 's/:\$//;'` -export PATH=`echo $INST_PATH/bin:$PATH | perl -pe 's/:\$//;'` -export MANPATH=`echo $INST_PATH/man:$INST_PATH/share/man:$MANPATH | perl -pe 's/:\$//;'` +export PATH=`echo $INST_PATH/bin:$BB_INST/bin:$PATH | perl -pe 's/:\$//;'` +export MANPATH=`echo $INST_PATH/man:$BB_INST/man:$INST_PATH/share/man:$MANPATH | perl -pe 's/:\$//;'` +export PERL5LIB=`echo $INST_PATH/lib/perl5:$PERL5LIB | perl -pe 's/:\$//;'` set -u ##### PCAP-core installation diff --git a/build/opt-build.sh b/build/opt-build.sh index 036b78b..874a46c 100755 --- a/build/opt-build.sh +++ b/build/opt-build.sh @@ -13,7 +13,6 @@ if [ "$#" -lt "1" ] ; then exit 1 fi - # get path to this script SCRIPT_PATH=`dirname $0`; SCRIPT_PATH=`(cd $SCRIPT_PATH && pwd)` @@ -42,12 +41,20 @@ mkdir -p $SETUP_DIR/distro # don't delete the actual distro directory until the mkdir -p $INST_PATH/bin cd $SETUP_DIR +# make sure tools installed can see the install loc of libraries +set +u +export LD_LIBRARY_PATH=`echo $INST_PATH/lib:$LD_LIBRARY_PATH | perl -pe 's/:\$//;'` +export PATH=`echo $INST_PATH/bin:$BB_INST/bin:$PATH | perl -pe 's/:\$//;'` +export MANPATH=`echo $INST_PATH/man:$BB_INST/man:$INST_PATH/share/man:$MANPATH | perl -pe 's/:\$//;'` +export PERL5LIB=`echo $INST_PATH/lib/perl5:$PERL5LIB | perl -pe 's/:\$//;'` +set -u + ## biobambam2 first BB_INST=$INST_PATH/biobambam2 if [ ! -e $SETUP_DIR/bbb2.sucess ]; then curl -sSL --retry 10 $BBB2_URL > distro.tar.gz mkdir -p $BB_INST - tar --strip-components 3 -C $BB_INST -Jxf distro.tar.gz + tar --strip-components 3 -C $BB_INST -zxf distro.tar.gz rm -f $BB_INST/bin/curl # don't let this file in SSL doesn't work rm -rf distro.* distro/* touch $SETUP_DIR/bbb2.success @@ -70,7 +77,7 @@ rm -f $SETUP_DIR/cpanm ## scramble (from staden) if [ ! -e $SETUP_DIR/staden.success ]; then curl -sSL --retry 10 $STADEN > distro.tar.gz - rm -rf distro/* + rm -rf distro/* $OPT/scramble mkdir -p $OPT/scramble tar --strip-components 1 -C distro -xzf distro.tar.gz cp -r distro/* $OPT/scramble @@ -121,6 +128,7 @@ fi if [ ! -e $SETUP_DIR/Bio-DB-HTS.success ]; then ## add perl deps cpanm --no-wget --no-interactive --notest --mirror http://cpan.metacpan.org -l $INST_PATH Module::Build + cpanm --no-wget --no-interactive --notest --mirror http://cpan.metacpan.org -l $INST_PATH XML::Parser cpanm --no-wget --no-interactive --notest --mirror http://cpan.metacpan.org -l $INST_PATH Bio::Root::Version curl -sSL --retry 10 https://github.com/Ensembl/Bio-DB-HTS/archive/${VER_BIODBHTS}.tar.gz > distro.tar.gz @@ -137,4 +145,3 @@ if [ ! -e $SETUP_DIR/Bio-DB-HTS.success ]; then fi cd $HOME -rm -rf $SETUP_DIR diff --git a/c/mismatchQc.c b/c/mismatchQc.c index 3da8656..216d4e7 100644 --- a/c/mismatchQc.c +++ b/c/mismatchQc.c @@ -369,9 +369,9 @@ int main(int argc, char *argv[]){ hts_opt_free(out_opts); //Add program line to header - SAM_hdr *cram_head = bam_header_to_cram(head); + SAM_hdr *cram_head = sam_hdr_dup(head); check(cram_head != NULL,"Error converting bam header to cram for PG add."); - int chk_h = sam_hdr_add_PG(cram_head,prog_id,"CL",prog_cl,"DS",prog_desc,"VN",VERSION,NULL); + int chk_h = sam_hdr_add_pg(cram_head,prog_id,"CL",prog_cl,"DS",prog_desc,"VN",VERSION,NULL); check(chk_h==0,"Error adding PG line to header."); //Reference setup if CRAM output if (wflags & W_CRAM) { @@ -398,7 +398,7 @@ int main(int argc, char *argv[]){ hts_set_opt(output, HTS_OPT_THREAD_POOL, &p); } - new_head = cram_header_to_bam(cram_head); + new_head = sam_hdr_dup(cram_head); int hd_chk = sam_hdr_write(output, new_head); check(hd_chk!=-1,"Error writing header to output file."); diff --git a/c/mmFlagModifier.c b/c/mmFlagModifier.c index 4a43476..eda8157 100644 --- a/c/mmFlagModifier.c +++ b/c/mmFlagModifier.c @@ -272,9 +272,9 @@ int main(int argc, char *argv[]){ hts_opt_free(out_opts); //Add program line to header - SAM_hdr *cram_head = bam_header_to_cram(head); + SAM_hdr *cram_head = sam_hdr_dup(head); check(cram_head != NULL,"Error converting bam header to cram for PG add."); - int chk_h = sam_hdr_add_PG(cram_head,prog_id,"CL",prog_cl,"DS",prog_desc,"VN",VERSION,NULL); + int chk_h = sam_hdr_add_pg(cram_head,prog_id,"CL",prog_cl,"DS",prog_desc,"VN",VERSION,NULL); check(chk_h==0,"Error adding PG line to header."); //Reference setup if CRAM output if (wflags & W_CRAM) { @@ -302,7 +302,7 @@ int main(int argc, char *argv[]){ hts_set_opt(output, HTS_OPT_THREAD_POOL, &p); } - new_head = cram_header_to_bam(cram_head); + new_head = sam_hdr_dup(cram_head); int hd_chk = sam_hdr_write(output, new_head); check(hd_chk!=-1,"Error writing header to output file."); @@ -325,7 +325,7 @@ int main(int argc, char *argv[]){ } //Check to see if the read has the mm tag required. int has_tag = 0; - has_tag = check_mm_tag(b); + has_tag = check_mm_tag(b); check(has_tag>=0,"Error checking for mm tag in read."); if(has_tag){ marked_count++; @@ -340,7 +340,7 @@ int main(int argc, char *argv[]){ int res = sam_write1(output,new_head,b); check(res>=0,"Error writing read to output file."); }//End of iteration through each read in the xam file - + bam_destroy1(b); bam_hdr_destroy(head); bam_hdr_destroy(new_head); diff --git a/docs.tar.gz b/docs.tar.gz deleted file mode 100644 index abb2424..0000000 Binary files a/docs.tar.gz and /dev/null differ diff --git a/lib/PCAP.pm b/lib/PCAP.pm index fbdcfcd..d75980c 100644 --- a/lib/PCAP.pm +++ b/lib/PCAP.pm @@ -28,7 +28,7 @@ use FindBin qw($Bin); use File::Which qw(which); # don't use autodie, only core perl in here -our $VERSION = '5.0.5'; +our $VERSION = '5.1.0'; our @EXPORT = qw($VERSION _which); const my $LICENSE => diff --git a/lib/PCAP/Bam.pm b/lib/PCAP/Bam.pm index 16f556a..19ba85b 100644 --- a/lib/PCAP/Bam.pm +++ b/lib/PCAP/Bam.pm @@ -36,19 +36,6 @@ use Data::UUID; use PCAP::Threaded; const my $BAMCOLLATE => q{(%s colsbs=268435456 collate=1 reset=1 exclude=SECONDARY,QCFAIL,SUPPLEMENTARY classes=F,F2 T=%s filename=%s level=1 > %s)}; -const my $MISMATCHQC => q{| %s -l 0 -t %.2f -p }; -const my $BAMBAM_DUP => q{%s level=0 %s | %s tmpfile=%s level=0 markthreads=%d M=%s.met %s| pee '%s tmpfile=%s index=1 md5=1 numthreads=%d md5filename=%s.md5 indexfilename=%s.%s > %s' '%s -o %s.bas -@ %d'}; -const my $BAMBAM_MERGE => q{%s %s tmpfile=%s level=0 %s| pee '%s tmpfile=%s index=1 md5=1 numthreads=%d md5filename=%s.md5 indexfilename=%s.%s > %s' '%s -o %s.bas -@ %d'}; -const my $BAMBAM_DUP_CRAM => q{%s level=0 %s | %s tmpfile=%s M=%s.met markthreads=%s level=0 %s| %s -r %s -t %d -I bam -O cram %s | tee %s | %s index - %s.crai}; -const my $BAMBAM_MERGE_CRAM => q{%s %s tmpfile=%s level=0 %s| %s -r %s -t %d -I bam -O cram %s | tee %s | %s index - %s.crai}; - -const my $LANE_BAMBAM_MERGE => q{%s SO=%s %s tmpfile=%s level=0 | pee '%s tmpfile=%s index=1 md5=1 numthreads=%d md5filename=%s.md5 indexfilename=%s.%s > %s' '%s -o %s.bas -@ %d'}; -const my $LANE_BAMBAM_MERGE_CRAM => q{%s SO=%s %s tmpfile=%s level=0 | %s -r %s -t %d -I bam -O cram %s | tee %s | %s index - %s.crai}; -const my $LANE_BAMBAM_MERGE_NOIDX => q{%s SO=%s %s tmpfile=%s level=0 | pee '%s tmpfile=%s md5=1 numthreads=%d md5filename=%s.md5 > %s' '%s -o %s.bas -@ %d'}; -const my $LANE_BAMBAM_MERGE_CRAM_NOIDX => q{%s SO=%s %s tmpfile=%s level=0 | %s -r %s -t %d -I bam -O cram %s %s}; - -const my $LANE_BAMBAM_DUP => q{%s level=0 %s | %s -l 0 -m | %s tmpfile=%s level=0 markthreads=%d M=%s.met | %s -l 0 -p | pee '%s tmpfile=%s index=1 md5=1 numthreads=%d md5filename=%s.md5 indexfilename=%s.%s > %s' '%s -o %s.bas -@ %d'}; -const my $LANE_BAMBAM_DUP_CRAM => q{%s level=0 %s | %s -l 0 -m | %s tmpfile=%s level=0 markthreads=%d M=%s.met | %s -l 0 -p | %s -r %s -t %d -I bam -O cram %s | tee %s | %s index - %s.crai}; const my $CRAM_CHKSUM => q{md5sum %s | perl -ne '/^(\S+)/; print "$1";' > %s.md5}; const my $BAM_STATS => q{ -i %s -o %s -@ %d}; @@ -107,143 +94,81 @@ sub bam_to_grouped_bam { } sub merge_or_mark_lanes { - my ($options, @sorted_bams) = @_; + my ($options, @bams) = @_; my $tmp = $options->{'tmp'}; - my $marked = File::Spec->catdir($options->{'outdir'}, $options->{'sample'}); + my $marked = File::Spec->catdir($options->{'outdir'}, $options->{'sample'}); if($options->{'cram'}) { $marked .= '.cram'; } else { $marked .= '.bam'; } - my @commands; return $marked if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); - my $helper_threads = $options->{'threads'}-1; - $helper_threads = 1 if($helper_threads < 1); - my $input_str = ' I='.join(' I=', sort @sorted_bams); + my @commands = ('set -o pipefail'); + + my $helper_threads = $options->{'threads'}; - my $bbb_tmp = File::Spec->catfile($tmp, 'biormdup'); + my $input_str = join q{ }, sort @bams; + + my $strmd_tmp = File::Spec->catfile($tmp, 'strmdup'); my $brc_tmp = File::Spec->catfile($tmp, 'brcTmp'); my %tools; - for my $tool(qw(bam_stats bammerge bammarkduplicates2 bamrecompress scramble samtools mmFlagModifier)) { + for my $tool(qw(bam_stats samtools md5sum)) { $tools{$tool} = _which($tool) || die "Unable to find '$tool' in path"; } + my $out_fmt = 'bam'; my $idx_type = 'bai'; - $idx_type = 'csi' if(exists $options->{'csi'}); + my $idx_csi_flag = q{}; + if($options->{'cram'}) { + $idx_type = 'crai'; + $out_fmt = 'cram'; + $out_fmt .= ',seqs_per_slice='.$options->{'seqslice'}; + } + elsif(exists $options->{'csi'}) { + # only valid for bam + $idx_type = 'csi'; + $idx_csi_flag = '-c'; + } - # subtly different, as need to strip mmQc before dup-rem, and then reapply if(defined $options->{'nomarkdup'} && $options->{'nomarkdup'} == 1) { - if(defined $options->{'noindex'} && $options->{'noindex'} == 1){ - if($options->{'cram'}) { - my $add_sc = $options->{'scramble'} || q{}; - $commands[0] = sprintf $LANE_BAMBAM_MERGE_CRAM_NOIDX, - $tools{'bammerge'}, - $options->{'sortorder'}, - $input_str, - $bbb_tmp, - $tools{'scramble'}, - $options->{'reference'}, - $helper_threads, - $add_sc, - $marked; - } - else { - $commands[0] = sprintf $LANE_BAMBAM_MERGE_NOIDX, - $tools{'bammerge'}, - $options->{'sortorder'}, - $input_str, - $bbb_tmp, - $tools{'bamrecompress'}, - $brc_tmp, - $helper_threads, - $marked, - $marked, - $tools{'bam_stats'}, - $marked, - $helper_threads; - } + my $idx = q{}; + unless($options->{'noindex'}) { + $idx = sprintf q{%s index -@ %d %s - %s.%s}, + $tools{samtools}, $helper_threads, $idx_csi_flag, $marked, $idx_type; } - else{ - if($options->{'cram'}) { - my $add_sc = $options->{'scramble'} || q{}; - $commands[0] = sprintf $LANE_BAMBAM_MERGE_CRAM, - $tools{'bammerge'}, - $options->{'sortorder'}, - $input_str, - $bbb_tmp, - $tools{'scramble'}, - $options->{'reference'}, - $helper_threads, - $add_sc, - $marked, - $tools{'samtools'}, - $marked; - } - else { - $commands[0] = sprintf $LANE_BAMBAM_MERGE, - $tools{'bammerge'}, - $options->{'sortorder'}, - $input_str, - $bbb_tmp, - $tools{'bamrecompress'}, - $brc_tmp, - $helper_threads, - $marked, - $marked, $idx_type, - $marked, - $tools{'bam_stats'}, - $marked, - $helper_threads; - } - } + + my $namesrt = q{}; + $namesrt = q{-n} if($options->{'qnamesort'}); + + my $merge = sprintf q{%s merge %s -u -@ %d - %s}, + $tools{samtools}, $namesrt, $helper_threads, $input_str; + my $compress = sprintf q{%s view -T %s --output-fmt %s -@ %d -}, + $tools{samtools}, $options->{reference}, $out_fmt, $helper_threads; + my $md5 = sprintf q{%s -b > %s.md5}, + $tools{md5sum}, $marked; + my $stats = sprintf q{%s -o %s.bas -@ %d}, + $tools{bam_stats}, $marked, $helper_threads; + push @commands, qq{$merge | pee "$stats" "$compress | pee '$idx' '$md5' 'cat > $marked'"}; } else { - if($options->{'cram'}) { - my $add_sc = $options->{'scramble'} || q{}; - $commands[0] = sprintf $LANE_BAMBAM_DUP_CRAM, - $tools{'bammerge'}, - $input_str, - $tools{'mmFlagModifier'}, - $tools{'bammarkduplicates2'}, - $bbb_tmp, - $helper_threads, - $marked, - $tools{'mmFlagModifier'}, - $tools{'scramble'}, - $options->{'reference'}, - $helper_threads, - $add_sc, - $marked, - $tools{'samtools'}, - $marked; - } - else { - $commands[0] = sprintf $LANE_BAMBAM_DUP, - $tools{'bammerge'}, - $input_str, - $tools{'mmFlagModifier'}, - $tools{'bammarkduplicates2'}, - $bbb_tmp, - $helper_threads, - $marked, - $tools{'mmFlagModifier'}, - $tools{'bamrecompress'}, - $brc_tmp, - $helper_threads, - $marked, - $marked, $idx_type, - $marked, - $tools{'bam_stats'}, - $marked, - $helper_threads; - } + my $merge = sprintf q{%s merge -u -@ %d - %s}, + $tools{samtools}, $helper_threads, $input_str; + my $markdup = sprintf q{%s markdup --mode %s --output-fmt bam,level=0 -S --include-fails -T %s -@ %d -f %s.met - -}, + $tools{samtools}, $options->{dupmode}, $strmd_tmp, $helper_threads, $marked; + my $compress = sprintf q{%s view -T %s --output-fmt %s -@ %d -}, + $tools{samtools}, $options->{reference}, $out_fmt, $helper_threads; + my $idx = sprintf q{%s index -@ %d %s - %s.%s}, + $tools{samtools}, $helper_threads, $idx_csi_flag, $marked, $idx_type; + my $md5 = sprintf q{%s -b > %s.md5}, + $tools{md5sum}, $marked; + my $stats = sprintf q{%s -o %s.bas -@ %d}, + $tools{bam_stats}, $marked, $helper_threads; + push @commands, qq{$merge | $markdup | pee "$compress | pee 'cat > $marked' '$idx' '$md5'" "$stats" }; } if($options->{'cram'}) { - unshift @commands, 'set -o pipefail'; push @commands, sprintf $CRAM_CHKSUM, $marked, $marked; - push @commands, 'set +o pipefail'; } PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), \@commands, 0); @@ -255,135 +180,95 @@ sub merge_and_mark_dup { # uncoverable subroutine my ($options, $source) = @_; my $tmp = $options->{'tmp'}; - my $marked = File::Spec->catdir($options->{'outdir'}, $options->{'sample'}); - - my @commands; - if($options->{'cram'}) { - $marked .= '.cram'; - } - else { - $marked .= '.bam'; - } + my $marked = File::Spec->catdir($options->{'outdir'}, $options->{'sample'}); + if($options->{'cram'}) { $marked .= '.cram'; } + else { $marked .= '.bam'; } return $marked if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); - my $helper_threads = $options->{'threads'}-1; - $helper_threads = 1 if($helper_threads < 1); + my @commands = ('set -o pipefail'); - my @sorted_bams; - my $input_str = q{}; + my $helper_threads = $options->{'threads'}; + + my @bams; if(defined $source) { opendir(my $dh, $source); while(my $file = readdir $dh) { next unless($file =~ m/_sorted\.bam$/); - push @sorted_bams, File::Spec->catfile($source, $file); + push @bams, File::Spec->catfile($source, $file); } closedir $dh; } else { for(@{$options->{'meta_set'}}) { - push @sorted_bams, $_->tstub.'_sorted.bam'; + push @bams, $_->tstub.'_sorted.bam'; } } - $input_str = ' I='.join(' I=', sort @sorted_bams); - my $bbb_tmp = File::Spec->catfile($tmp, 'biormdup'); + my $input_str = join q{ }, sort @bams; + + my $strmd_tmp = File::Spec->catfile($tmp, 'strmdup'); my $brc_tmp = File::Spec->catfile($tmp, 'brcTmp'); my %tools; - for my $tool(qw(bammerge bammarkduplicates2 bamrecompress scramble samtools bam_stats mismatchQc)) { + for my $tool(qw(samtools bam_stats mismatchQc md5sum)) { $tools{$tool} = _which($tool) || die "Unable to find '$tool' in path"; } my $mismatchQc = q{}; if(defined $options->{'mmqc'}) { - $mismatchQc = sprintf $MISMATCHQC, + $mismatchQc = sprintf q{ | %s -l 0 -t %.2f -p}, $tools{'mismatchQc'}, $options->{'mmqcfrac'}; } + my $out_fmt = 'bam'; my $idx_type = 'bai'; - $idx_type = 'csi' if(exists $options->{'csi'}); + my $idx_csi_flag = q{}; + if($options->{'cram'}) { + $idx_type = 'crai'; + $out_fmt = 'cram'; + $out_fmt .= ',seqs_per_slice='.$options->{'seqslice'}; + } + elsif(exists $options->{'csi'}) { + # only valid for bam + $idx_type = 'csi'; + $idx_csi_flag = '-c'; + } if(defined $options->{'nomarkdup'} && $options->{'nomarkdup'} == 1) { - if($options->{'cram'}) { - my $add_sc = $options->{'scramble'} || q{}; - $commands[0] = sprintf $BAMBAM_MERGE_CRAM, - $tools{'bammerge'}, - $input_str, - $bbb_tmp, - $mismatchQc, - $tools{'scramble'}, - $options->{'reference'}, - $helper_threads, - $add_sc, - $marked, - $tools{'samtools'}, - $marked; - } - else { - $commands[0] = sprintf $BAMBAM_MERGE, - $tools{'bammerge'}, - $input_str, - $bbb_tmp, - $mismatchQc, - $tools{'bamrecompress'}, - $brc_tmp, - $helper_threads, - $marked, - $marked, $idx_type, - $marked, - $tools{'bam_stats'}, - $marked, - $helper_threads; - } + my $merge = sprintf q{%s merge -u -@ %d - %s}, + $tools{samtools}, $helper_threads, $input_str; + my $compress = sprintf q{%s view -T %s --output-fmt %s -@ %d -}, + $tools{samtools}, $options->{reference}, $out_fmt, $helper_threads; + my $idx = sprintf q{%s index -@ %d %s - %s.%s}, + $tools{samtools}, $helper_threads, $idx_csi_flag, $marked, $idx_type; + my $md5 = sprintf q{%s -b > %s.md5}, + $tools{md5sum}, $marked; + my $stats = sprintf q{%s -o %s.bas -@ %d}, + $tools{bam_stats}, $marked, $helper_threads; + push @commands, qq{$merge $mismatchQc | pee "$stats" "$compress | pee '$idx' '$md5' 'cat > $marked'"}; } else { - if($options->{'cram'}) { - my $add_sc = $options->{'scramble'} || q{}; - $commands[0] = sprintf $BAMBAM_DUP_CRAM, - $tools{'bammerge'}, - $input_str, - $tools{'bammarkduplicates2'}, - $bbb_tmp, - $marked, - $helper_threads, - $mismatchQc, - $tools{'scramble'}, - $options->{'reference'}, - $helper_threads, - $add_sc, - $marked, - $tools{'samtools'}, - $marked; - } - else { - $commands[0] = sprintf $BAMBAM_DUP, - $tools{'bammerge'}, - $input_str, - $tools{'bammarkduplicates2'}, - $bbb_tmp, - $helper_threads, - $marked, - $mismatchQc, - $tools{'bamrecompress'}, - $brc_tmp, - $helper_threads, - $marked, - $marked, $idx_type, - $marked, - $tools{'bam_stats'}, - $marked, - $helper_threads; - } + my $merge = sprintf q{%s merge -u -@ %d - %s}, + $tools{samtools}, $helper_threads, $input_str; + my $markdup = sprintf q{%s markdup --mode %s --output-fmt bam,level=0 -S --include-fails -T %s -@ %d -f %s.met - -}, + $tools{samtools}, $options->{dupmode}, $strmd_tmp, $helper_threads, $marked; + my $compress = sprintf q{%s view -T %s --output-fmt %s -@ %d -}, + $tools{samtools}, $options->{reference}, $out_fmt, $helper_threads; + my $idx = sprintf q{%s index -@ %d %s - %s.%s}, + $tools{samtools}, $helper_threads, $idx_csi_flag, $marked, $idx_type; + my $md5 = sprintf q{%s -b > %s.md5}, + $tools{md5sum}, $marked; + my $stats = sprintf q{%s -o %s.bas -@ %d}, + $tools{bam_stats}, $marked, $helper_threads; + push @commands, qq{$merge $mismatchQc | $markdup | pee "$compress | pee 'cat > $marked' '$idx' '$md5'" "$stats" }; } if($options->{'cram'}) { - unshift @commands, 'set -o pipefail'; push @commands, sprintf $CRAM_CHKSUM, $marked, $marked; - push @commands, 'set +o pipefail'; } PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), \@commands, 0); @@ -395,17 +280,12 @@ sub bam_stats { # uncoverable subroutine my $options = shift; my $tmp = $options->{'tmp'}; + # leagacy method, not needed my $ext = '.bam'; $ext = '.cram' if($options->{'cram'}); my $xam = File::Spec->catdir($options->{'outdir'}, $options->{'sample'}).$ext; my $bas = "$xam.bas"; return $bas if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); - if($options->{'cram'}) { - # Only needed for cram output - my $command = _which('bam_stats') || die "Unable to find 'bam_stats' in path"; - $command .= sprintf $BAM_STATS, $xam, $bas, $options->{'threads'}; - PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0); - } PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); return $bas; } diff --git a/lib/PCAP/Bwa.pm b/lib/PCAP/Bwa.pm index 3ef6b97..a963fb9 100644 --- a/lib/PCAP/Bwa.pm +++ b/lib/PCAP/Bwa.pm @@ -27,7 +27,7 @@ use autodie qw(:all); use English qw( -no_match_vars ); use warnings FATAL => 'all'; use Const::Fast qw(const); -use File::Path qw(make_path); +use File::Path qw(make_path remove_tree); use File::Spec; use Capture::Tiny qw(capture); use File::Copy qw(copy); @@ -36,11 +36,9 @@ use PCAP::Bwa::Meta; use PCAP::Bam; const my $BWA_ALN => q{ aln%s -t %s -f %s_%s.sai %s %s.%s}; -const my $BAMFASTQ => q{%s view -F 2816 -T %s -u %s| %s exclude=QCFAIL,SECONDARY,SUPPLEMENTARY tryoq=1 gz=1 level=1 outputperreadgroup=1 outputperreadgroupsuffixF=_i.fq outputperreadgroupsuffixF2=_i.fq T=%s outputdir=%s split=%s}; -const my $CRAMFASTQ => q{%s reference=%s inputformat=cram exclude=QCFAIL,SECONDARY,SUPPLEMENTARY tryoq=1 gz=1 level=1 outputperreadgroup=1 outputperreadgroupsuffixF=_i.fq outputperreadgroupsuffixF2=_i.fq T=%s outputdir=%s split=%s filename=%s}; -const my $BWA_MEM => q{ mem %s %s -R %s -t %s %s}; const my $ALN_TO_SORTED => q{ sampe -P -a 1000 -r '%s' %s %s_1.sai %s_2.sai %s.%s %s.%s | %s fixmate=1 inputformat=sam level=1 tmpfile=%s_tmp O=%s_sorted.bam}; -const my $BAMSORT => q{ fixmate=1 inputformat=sam level=1 tmpfile=%s_tmp O=%s_sorted.bam outputthreads=%s calmdnm=1 calmdnmrecompindetonly=1 calmdnmreference=%s sortthreads=%s}; + +const my $TAG_STRIP => q{-x AM -x MD -x NM -x RT -x SM -x X0 -x X1 -x XA -x XG -x XM -x XN -x XO -x XT -x mm}; const my $FALSE_RG => q{@RG\tID:%s\tSM:%s\tLB:default\tPL:ILLUMINA}; @@ -92,9 +90,6 @@ sub mem_setup { copy("$options->{reference}.fai", "$options->{tmp}/decomp.fa.fai") unless(-e "$options->{decomp_ref}.fai"); } } - # do some checking to ensure input BAM/CRAM hasn't been through mismatchQc - # if it has check for use of at least bammaskflags - PCAP::Bam::mismatchQc_checks($options->{'raw_files'}) unless($skip_mmqc_check); return 1; } @@ -136,6 +131,7 @@ sub mem_mapmax { next if($file =~ m/^\./); next if($file =~ m/^pairedfq2\.[[:digit:]]+/); # captured by 1.* next if($file =~ m/s[.]fq[.]gz_[[:digit:]]+[.]gz$/); + next if($file eq 'unknown.bam'); if($file =~ m/o[12][.]fq[.]gz_[[:digit:]]+[.]gz$/) { warn "Orphan reads found, your input BAM appears to have had duplicates 'removed' rather than 'marked': $folder/$file\n\tWARNING: This will give a sub-optimal result\n"; next; @@ -171,7 +167,6 @@ sub split_in { make_path($split_folder) unless(-d $split_folder); make_path($sort_folder) unless(-d $sort_folder); - my $fragment_size = $options->{'fragment'}; $fragment_size ||= $READPAIR_SPLITSIZE; @@ -213,28 +208,19 @@ sub split_in { } # if bam|cram input else { - my $bam2fq = _which('bamtofastq') || die "Unable to find 'bamtofastq' in path"; - my $cmd; - if($input->bam_or_cram eq 'cram') { - $cmd = sprintf $CRAMFASTQ, $bam2fq, - $options->{'reference'}, - File::Spec->catfile($tmp, "bamtofastq.$index"), - $split_folder, - $fragment_size * $MILLION * $BAM_MULT, - $input->in; - } - else { - my $samtools = _which('samtools') || die "Unable to find 'samtools' in path"; - $cmd = sprintf $BAMFASTQ, $samtools, - $options->{'reference'}, - $input->in, - $bam2fq, - File::Spec->catfile($tmp, "bamtofastq.$index"), - $split_folder, - $fragment_size * $MILLION * $BAM_MULT; - } + my $helpers = $options->{threads_per_split}; + my $collate_folder = File::Spec->catdir($options->{'tmp'}, 'collate', $index); + make_path($collate_folder) unless(-d $collate_folder); + my $samtools = _which('samtools') || die "Unable to find 'samtools' in path"; + my $mmQcStrip = sprintf '%s --remove -l 0 -@ %d -i %s', _which('mmFlagModifier'), $helpers, $input->in; + my $view = sprintf '%s view %s -bu -T %s -F 2816 -@ %d -', $samtools, $TAG_STRIP, $options->{'reference'}, $helpers; # leave + my $collate = sprintf '%s collate -Ou -@ %d - %s/collate', $samtools, $helpers, $collate_folder; + my $split = sprintf '%s split --output-fmt bam,level=1 -@ %d -u %s/unknown.bam -f %s/%%!_i.bam -', $samtools, $helpers, $split_folder, $split_folder; + my $cmd = sprintf '%s | %s | %s | %s', $mmQcStrip, $view, $collate, $split; # treat as interleaved fastq + push @commands, 'set -o pipefail'; push @commands, $cmd; + push @commands, "rm -rf $collate_folder"; # cleanup temp folder } PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), \@commands, $index); @@ -250,6 +236,7 @@ sub bwa_mem { my $tmp = $options->{'tmp'}; return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); + my $input_meta = $options->{'meta_set'}; my $to_map = $options->{'to_map'}; my $iter = 1; @@ -272,7 +259,7 @@ sub bwa_mem { $rg_line = q{'}.$input->rg_header(q{\t}).q{'}; } else { - my ($rg) = $split =~ m|/split/[[:digit:]]+/(.+)_i.fq_[[:digit:]]+.gz$|; + my ($rg) = $split =~ m{/split/[[:digit:]]+/(.+)_i\.(?:fq_[[:digit:]]+\.)?(?:gz|bam)$}; ($rg_line, undef) = PCAP::Bam::rg_line_for_output($input->in, $options->{'sample'}, undef, $rg); if($rg_line) { $rg_line =~ s/('+)/'"$1"'/g; @@ -293,12 +280,18 @@ sub bwa_mem { else { if(exists $options->{'bwa_pl'}) { $bwa .= 'LD_PRELOAD='.$options->{'bwa_pl'}.' '; + } elsif(exists $ENV{GPERF_FOR_BWA}) { + $bwa .= 'LD_PRELOAD='.$ENV{GPERF_FOR_BWA}.' '; } $bwa .= _which('bwa') || die "Unable to find 'bwa' in path"; } $ENV{SHELL} = '/bin/bash'; # ensure bash to allow pipefail - my $command = 'set -o pipefail; '; + + my %tools; + for my $tool(qw(samtools reheadSQ)) { + $tools{$tool} = _which($tool) || die "Unable to find '$tool' in path"; + } my $interleaved_fq = q{}; # uncoverable branch true @@ -307,41 +300,40 @@ sub bwa_mem { my $add_options = q{-v 1}; # minimise output $add_options = $options->{'bwa'} if(exists $options->{'bwa'}); - $bwa .= sprintf $BWA_MEM, $add_options, $interleaved_fq, $rg_line, $threads, $options->{'reference'}; + $bwa .= sprintf q{ mem %s %s -R %s -t %s %s}, $add_options, $interleaved_fq, $rg_line, $threads, $options->{'reference'}; # uncoverable branch true # uncoverable branch false - if($input->paired_fq) { - $split =~ s/'/\\'/g; - my $split2 = $split; - $split2 =~ s/pairedfq1(\.[[:digit:]]+)/pairedfq2$1/; + $split =~ s/'/\\'/g; + if($input->fastq) { $bwa .= ' '.$split; - $bwa .= ' '.$split2; + if($input->paired_fq) { + my $split2 = $split; + $split2 =~ s/pairedfq1(\.[[:digit:]]+)/pairedfq2$1/; + $bwa .= ' '.$split2; + } } else { - $split =~ s/'/\\'/g; - $bwa .= ' '.$split; + # bam/cram + my $tofastq = sprintf '%s fastq -@ %d -N %s', $tools{samtools}, $threads, $split; + $bwa = sprintf '%s | %s /dev/stdin', $tofastq, $bwa; } - $command .= $bwa; - - # now add the code for reheadSQ - my $rehead_sq = sprintf '%s -d %s', _which('reheadSQ'), $options->{'dict'}; - $command .= ' | '.$rehead_sq; - - my $helpers = 1; - # uncoverable branch true - # uncoverable branch false - $helpers = $threads - 1 if($threads > 1); my $sorted_bam_stub = $split; $sorted_bam_stub =~ s|/split/([[:digit:]]+)/(.+)$|/sorted/$1_$2|; $sorted_bam_stub =~ s/\\'/-/g; my $ref = exists $options->{'decomp_ref'} ? $options->{'decomp_ref'} : $options->{'reference'}; - my $sort = _which('bamsort') || die "Unable to find 'bamsort' in path\n"; - $sort .= sprintf $BAMSORT, File::Spec->catfile($tmp, "bamsort.$index"), $sorted_bam_stub, $helpers, $ref, $helpers; - $command .= " | $sort"; + my $rehead_sq = sprintf '%s -d %s', + $tools{reheadSQ}, $options->{'dict'}; + my $fixmate = sprintf q{%s fixmate -m --output-fmt bam,level=0 -@ %d - -}, + $tools{samtools}, $threads; + my $sort = sprintf q{%s sort -m 2G --output-fmt bam,level=0 -T %s_tmp -@ %d -}, + $tools{samtools}, File::Spec->catfile($tmp, "bamsort.$index"), $threads; + my $calmd = sprintf q{%s calmd --output-fmt bam,level=1 -Q -@ %d - %s > %s_sorted.bam}, + $tools{samtools}, $threads, $ref, $sorted_bam_stub; + my $command .= "set -o pipefail; $bwa | $rehead_sq | $fixmate | $sort | $calmd"; PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, $index); PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), $index); diff --git a/prerelease.sh b/prerelease.sh deleted file mode 100755 index 92ac79c..0000000 --- a/prerelease.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash - -set -eu # exit on first error or undefined value in subtitution -set -o pipefail - -# get current directory -INIT_DIR=`pwd` - -rm -rf blib - -# get location of this file -MY_PATH="`dirname \"$0\"`" # relative -MY_PATH="`( cd \"$MY_PATH\" && pwd )`" # absolutized and normalized -if [ -z "$MY_PATH" ] ; then - # error; for some reason, the path is not accessible - # to the script (e.g. permissions re-evaled after suid) - echo Failed to determine location of script >2 - exit 1 # fail -fi -# change into the location of the script -cd $MY_PATH - -echo -e '\n### Compile/Test C ###\n' -set +u -if [ "x$HTSLIB" == "x" ] || [ "x$LIBBIGWIG" == "x" ]; then - echo -e '\n\t$HTSLIB or $LIBBIGWIG not defined skipping C compile/test\n' -else - make -C c clean - make -C c - make -C c clean -fi -set -u - -echo -e '\n\n### Running perl tests ###\n' - -export HARNESS_PERL_SWITCHES=-MDevel::Cover=-db,reports,-select='^lib/*\.pm$',-ignore,'^t/' -rm -rf reports docs pm_to_blib blib -cover -delete -mkdir -p docs/reports_text -prove -w -j 9 -I ./lib - -echo -e '\n\n### Generating test/pod coverage reports ###\n' -# removed 'condition' from coverage as '||' 'or' doesn't work properly -cover -coverage branch,subroutine,pod -report_c0 50 -report_c1 85 -report_c2 100 -report html_basic reports -silent |& grep -v '^Perltidy' | grep -v '^##' | grep -v '^1:' -# grep on last command to cleanup an oddity in perltidy -cover -coverage branch,subroutine,pod -report text reports -silent > docs/reports_text/coverage.txt -rm -rf reports/structure reports/digests reports/cover.13 reports/runs -cp reports/coverage.html reports/index.html -mv reports docs/reports_html -unset HARNESS_PERL_SWITCHES - -echo '### Generating POD ###' -mkdir -p docs/pod_html -perl -MPod::Simple::HTMLBatch -e 'Pod::Simple::HTMLBatch::go' lib:bin docs/pod_html > /dev/null - -echo '### Archiving docs folder ###' -tar cz -C $INIT_DIR -f docs.tar.gz docs - -# generate manifest, and cleanup -echo '### Generating MANIFEST ###' -# delete incase any files are moved, the make target just adds stuff -rm -f MANIFEST -# cleanup things which could break the manifest -rm -rf install_tmp -perl Makefile.PL > /dev/null -make manifest &> /dev/null -rm -f Makefile MANIFEST.bak pm_to_blib - -# change back to original dir -cd $INIT_DIR diff --git a/setup.sh b/setup.sh index 8aabf6d..1dfb91f 100755 --- a/setup.sh +++ b/setup.sh @@ -1,279 +1,62 @@ #!/bin/bash -SOURCE_BWA="https://github.com/lh3/bwa/archive/v0.7.17.tar.gz" -SOURCE_BWAMEM2="https://github.com/bwa-mem2/bwa-mem2/releases/download/v2.0pre2/bwa-mem2-2.0pre2_x64-linux.tar.bz2" - -# for bamstats and Bio::DB::HTS -SOURCE_HTSLIB="https://github.com/samtools/htslib/releases/download/1.7/htslib-1.7.tar.bz2" -SOURCE_SAMTOOLS="https://github.com/samtools/samtools/releases/download/1.7/samtools-1.7.tar.bz2" - -# Bio::DB::HTS -SOURCE_BIOBDHTS="https://github.com/Ensembl/Bio-HTS/archive/2.10.tar.gz" - -# for biobambam -SOURCE_BBB_BIN_DIST="https://github.com/gt1/biobambam2/releases/download/2.0.86-release-20180228171821/biobambam2-2.0.86-release-20180228171821-x86_64-etch-linux-gnu.tar.gz" - -get_distro () { - EXT="" - if [[ $2 == *.tar.bz2* ]] ; then - EXT="tar.bz2" - elif [[ $2 == *.zip* ]] ; then - echo "ERROR: zip archives are not supported by default, if pulling from github replace .zip with .tar.gz" - exit 1 - elif [[ $2 == *.tar.gz* ]] ; then - EXT="tar.gz" - else - echo "I don't understand the file type for $1" - exit 1 - fi - rm -f $1.$EXT - if hash curl 2>/dev/null; then - curl --retry 10 -sS -o $1.$EXT -L $2 - else - echo "ERROR: curl not found" - exit 1 - fi -} - -get_file () { -# output, source - if hash curl 2>/dev/null; then - curl -sS -o $1 -L $2 - else - wget -nv -O $1 $2 - fi -} - -if [ "$#" -ne "1" ] ; then - echo "Please provide an installation path such as /opt/ICGC" - exit 0 +# ALL tool versions used by opt-build.sh +# need to keep in sync with Dockerfile + +# newer gitlab versions do not work +export BBB2_URL="https://github.com/gt1/biobambam2/releases/download/2.0.87-release-20180301132713/biobambam2-2.0.87-release-20180301132713-x86_64-etch-linux-gnu.tar.gz" +export BWAMEM2_URL="https://github.com/bwa-mem2/bwa-mem2/releases/download/v2.0pre2/bwa-mem2-2.0pre2_x64-linux.tar.bz2" +export STADEN="https://iweb.dl.sourceforge.net/project/staden/staden/2.0.0b11/staden-2.0.0b11-2016-linux-x86_64.tar.gz" +export VER_BIODBHTS="3.01" +export VER_BWA="v0.7.17" +export VER_HTSLIB="1.10.2" +export VER_SAMTOOLS="1.10" + + +if [[ ($# -ne 1 && $# -ne 2) ]] ; then + echo "Please provide an installation path and optionally perl lib paths to allow, e.g." + echo " ./setup.sh /opt/myBundle" + echo "OR all elements versioned:" + echo " ./setup.sh /opt/cgpPinel-X.X.X /opt/cgpVcf-X.X.X/lib/perl:/opt/PCAP-core-X.X.X/lib/perl" + exit 1 fi -set -e +INST_PATH=$1 -CPU=`grep -c ^processor /proc/cpuinfo` -if [ $? -eq 0 ]; then - if [ "$CPU" -gt "6" ]; then - CPU=6 - fi -else - CPU=1 +if [[ $# -eq 2 ]] ; then + CGP_PERLLIBS=$2 fi -echo "Max compilation CPUs set to $CPU" - -INST_PATH=$1 # get current directory INIT_DIR=`pwd` +set -e + # cleanup inst_path -mkdir -p $INST_PATH +mkdir -p $INST_PATH/bin cd $INST_PATH INST_PATH=`pwd` -mkdir -p $INST_PATH/bin cd $INIT_DIR # make sure that build is self contained -unset PERL5LIB -ARCHNAME=`perl -e 'use Config; print $Config{archname};'` PERLROOT=$INST_PATH/lib/perl5 -export PERL5LIB="$PERLROOT" -export PATH="$INST_PATH/biobambam2/bin:$INST_PATH/bin:$PATH" - -#create a location to build dependencies -SETUP_DIR=$INIT_DIR/install_tmp -mkdir -p $SETUP_DIR - -# check for cgpBigWig -if [ -e "$INST_PATH/bin/detectExtremeDepth" ]; then - echo -e "\tcgpBigWig installation found"; -else - echo -e "\tERROR: Please see README.md and install cgpBigWig"; - exit 1 -fi - -## grab cpanm and stick in workspace, then do a self upgrade into bin: -get_file $SETUP_DIR/cpanm https://cpanmin.us/ -perl $SETUP_DIR/cpanm --no-wget -l $INST_PATH App::cpanminus -CPANM=`which cpanm` -echo $CPANM - -if ! ( perl -MExtUtils::MakeMaker -e 1 >/dev/null 2>&1); then - echo - echo "WARNING: Your Perl installation does not seem to include a complete set of core modules. Attempting to cope with this, but if installation fails please make sure that at least ExtUtils::MakeMaker is installed. For most users, the best way to do this is to use your system's package manager: apt, yum, fink, homebrew, or similar." -fi - -if [ -e $SETUP_DIR/basePerlDeps.success ]; then - echo "Previously installed base perl deps..." -else - perlmods=( "ExtUtils::CBuilder" "Module::Build~0.42" "Const::Fast" "File::Which" "LWP::UserAgent" "Bio::Root::Version~1.006924") - for i in "${perlmods[@]}" ; do - $CPANM --no-wget --no-interactive --notest --mirror http://cpan.metacpan.org -l $INST_PATH $i - done - touch $SETUP_DIR/basePerlDeps.success -fi - - -echo -n "Get htslib ..." -if [ -e $SETUP_DIR/htslibGet.success ]; then - echo " already staged ..."; -else - echo - cd $SETUP_DIR - get_distro "htslib" $SOURCE_HTSLIB - touch $SETUP_DIR/htslibGet.success -fi -echo -n "Building htslib ..." -if [ -e $SETUP_DIR/htslib.success ]; then - echo " previously installed ..."; +# allows user to knowingly specify other PERL5LIB areas. +if [ -z ${CGP_PERLLIBS+x} ]; then + export PERL5LIB="$PERLROOT" else - echo - mkdir -p htslib - tar --strip-components 1 -C htslib -jxf htslib.tar.bz2 - cd htslib - ./configure --enable-plugins --enable-libcurl --prefix=$INST_PATH - make -j$CPU - make install - cd $SETUP_DIR - touch $SETUP_DIR/htslib.success + export PERL5LIB="$PERLROOT:$CGP_PERLLIBS" fi -export HTSLIB=$INST_PATH - -CHK=`perl -le 'eval "require $ARGV[0]" and print $ARGV[0]->VERSION' Bio::DB::HTS` -if [[ "x$CHK" == "x" ]] ; then - echo -n "Building Bio::DB::HTS ..." - if [ -e $SETUP_DIR/biohts.success ]; then - echo " previously installed ..."; - else - echo - cd $SETUP_DIR - rm -rf bioDbHts - get_distro "bioDbHts" $SOURCE_BIOBDHTS - mkdir -p bioDbHts - tar --strip-components 1 -C bioDbHts -zxf bioDbHts.tar.gz - cd bioDbHts - perl Build.PL --htslib=$HTSLIB --install_base=$INST_PATH - ./Build - ./Build test - ./Build install - cd $SETUP_DIR - rm -f bioDbHts.tar.gz - touch $SETUP_DIR/biohts.success - fi -else - echo "Bio::DB::HTS already installed ..." -fi - -cd $INIT_DIR - -echo -n "Building samtools ..." -if [ -e $SETUP_DIR/samtools.success ]; then - echo " previously installed ..."; -else -echo - cd $SETUP_DIR - rm -rf samtools - get_distro "samtools" $SOURCE_SAMTOOLS - mkdir -p samtools - tar --strip-components 1 -C samtools -xjf samtools.tar.bz2 - cd samtools - ./configure --enable-plugins --enable-libcurl --prefix=$INST_PATH - make -j$CPU all all-htslib - make install all all-htslib - cd $SETUP_DIR - rm -f samtools.tar.bz2 - touch $SETUP_DIR/samtools.success -fi - - -cd $SETUP_DIR -echo -n "Building BWA ..." -if [ -e $SETUP_DIR/bwa.success ]; then - echo " previously installed ..." -else - echo - get_distro "bwa" $SOURCE_BWA - mkdir -p bwa - tar --strip-components 1 -C bwa -zxf bwa.tar.gz - make -C bwa -j$CPU - cp bwa/bwa $INST_PATH/bin/. - rm -f bwa.tar.gz - touch $SETUP_DIR/bwa.success -fi - -## build BWA-mem2 (tar.gz) -cd $SETUP_DIR -echo -n "Building bwa-mem2 ..." -if [ ! -e $SETUP_DIR/bwa2.success ]; then - get_distro "bwa2" $SOURCE_BWAMEM2 - mkdir -p bwa2 - tar --strip-components 1 -C bwa2 -jxf bwa2.tar.bz2 - cp bwa2/bwa-mem2* $INST_PATH/bin/. - rm -rf bwa2.* bwa2/* - touch $SETUP_DIR/bwa2.success -fi - -echo -n "Building biobambam2 ..." -if [ -e $SETUP_DIR/biobambam2.success ]; then - echo " previously installed ..." -else -echo - cd $SETUP_DIR - get_distro "biobambam2" $SOURCE_BBB_BIN_DIST - mkdir -p $INST_PATH/biobambam2 - tar -m --strip-components 3 -C $INST_PATH/biobambam2 -zxf biobambam2.tar.gz - rm -f $INST_PATH/biobambam2/bin/curl # don't let this file in SSL doesn't work - rm -f biobambam2.tar.gz - touch $SETUP_DIR/biobambam2.success -fi - -cd $INIT_DIR - -echo -n "Building PCAP-c ..." -if [ -e $SETUP_DIR/bam_stats.success ]; then - echo " previously installed ..."; -else - echo - cd $INIT_DIR - make -C c clean - if [ -z ${REF_PATH+x} ]; then - export REF_CACHE=$INIT_DIR/t/data/ref_cache/%2s/%2s/%s - export REF_PATH=$REF_CACHE - fi - env HTSLIB=$SETUP_DIR/htslib make -C c -j$CPU prefix=$INST_PATH - cp bin/bam_stats $INST_PATH/bin/. - cp bin/reheadSQ $INST_PATH/bin/. - cp bin/diff_bams $INST_PATH/bin/. - cp bin/mismatchQc $INST_PATH/bin/. - cp bin/mmFlagModifier $INST_PATH/bin/. - touch $SETUP_DIR/bam_stats.success - make -C c clean -fi - -cd $INIT_DIR - -echo -n "Building PCAP_perlPrereq ..." -if [ -e $SETUP_DIR/PCAP_perlPrereq.success ]; then - echo "PCAP_perlPrereq previously installed ..."; -else - echo - $CPANM --no-wget --no-interactive --notest --mirror http://cpan.metacpan.org --notest -l $INST_PATH --installdeps . - touch $SETUP_DIR/PCAP_perlPrereq.success -fi - -echo -n "Installing PCAP ..." -$CPANM --no-wget -v --no-interactive --mirror http://cpan.metacpan.org -l $INST_PATH . -echo +export OPT=$INST_PATH -# cleanup all junk -rm -rf $SETUP_DIR +bash build/opt-build.sh $INST_PATH +bash build/opt-build-local.sh $INST_PATH echo echo echo "Please add the following to beginning of path:" -echo " $INST_PATH/biobambam2/bin:$INST_PATH/bin" +echo " $INST_PATH/bin" echo "Please add the following to beginning of PERL5LIB:" echo " $PERLROOT" echo diff --git a/t/3_external_progs.t b/t/3_external_progs.t index e329960..8f53e5b 100644 --- a/t/3_external_progs.t +++ b/t/3_external_progs.t @@ -32,9 +32,10 @@ use Data::Dumper; use version 0.77; const my @REQUIRED_PROGRAMS => qw(bamcollate2 bammarkduplicates2 bamsort bwa samtools); -const my $BIOBAMBAM2_VERSION => '2.0.86'; -const my $BWA_VERSION => '0.7.12'; -const my $SAMTOOLS_VERSION => '1.7'; +# left-pad all subelements to 3 digits +const my $BIOBAMBAM2_VERSION => '2.000.086'; # 2.0.86 +const my $BWA_VERSION => '0.007.012'; # 0.7.12 +const my $SAMTOOLS_VERSION => '1.010'; # 1.10 # can't put regex in const my %EXPECTED_VERSION = ( @@ -89,7 +90,14 @@ subtest 'External programs have expected version' => sub { } my $reg = $details->{'match'}; - my ($version) = $stream =~ /$reg/m; + my $version; + # have to fix problem numbering + ($version) = $stream =~ /$reg/m; + my ($maj, $min, $hot) = split /\./, $version; + $version = sprintf '%d.%03d', $maj, $min; + if(defined $hot) { + $version .= sprintf '.%03d', $hot; + } version->parse($version); ok(version->parse($version) >= $details->{'version'}, sprintf 'Expect minimum version of %s for %s, got %s', $details->{'version'}, $prog, $version);