diff --git a/bin/concatenateVCFs.sh b/bin/concatenateVCFs.sh index b92138f80a..d1ea48cc8d 100755 --- a/bin/concatenateVCFs.sh +++ b/bin/concatenateVCFs.sh @@ -8,27 +8,27 @@ usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension while [[ $# -gt 0 ]] do - key=$1 - case $key in + key=$1 + case $key in -i) genomeIndex=$2 shift # past argument - shift # past value + shift # past value ;; -c) cpus=$2 shift # past argument - shift # past value + shift # past value ;; -o) outputFile=$2 shift # past argument - shift # past value + shift # past value ;; -t) targetBED=$2 shift # past argument - shift # past value + shift # past value ;; -n) noInt=1 @@ -47,7 +47,7 @@ if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi if [ -z ${noInt+x} ] - then +then # First make a header from one of the VCF # Remove interval information from the GATK command-line, but leave the rest FIRSTVCF=$(set +o pipefail; ls *.vcf | head -n 1) @@ -62,36 +62,36 @@ if [ -z ${noInt+x} ] # Concatenate VCFs in the correct order ( - cat header + cat header - for chr in "${CONTIGS[@]}"; do - # Skip if globbing would not match any file to avoid errors such as - # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3 - # was not processed. - pattern="${chr}_*.vcf" - if ! compgen -G "${pattern}" > /dev/null; then continue; fi + for chr in "${CONTIGS[@]}"; do + # Skip if globbing would not match any file to avoid errors such as + # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3 + # was not processed. + pattern="${chr}_*.vcf" + if ! compgen -G "${pattern}" > /dev/null; then continue; fi - # ls -v sorts by numeric value ("version"), which means that chr1_100_ - # is sorted *after* chr1_99_. - for vcf in $(ls -v ${pattern}); do - # Determine length of header. - # The 'q' command makes sed exit when it sees the first non-header - # line, which avoids reading in the entire file. - L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l) - - # Then print all non-header lines. Since tail is very fast (nearly as - # fast as cat), this is way more efficient than using a single sed, - # awk or grep command. - tail -n +$((L+1)) ${vcf} - done - done + # ls -v sorts by numeric value ("version"), which means that chr1_100_ + # is sorted *after* chr1_99_. + for vcf in $(ls -v ${pattern}); do + # Determine length of header. + # The 'q' command makes sed exit when it sees the first non-header + # line, which avoids reading in the entire file. + L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l) + + # Then print all non-header lines. Since tail is very fast (nearly as + # fast as cat), this is way more efficient than using a single sed, + # awk or grep command. + tail -n +$((L+1)) ${vcf} + done + done ) | bgzip -@${cpus} > rawcalls.vcf.gz tabix rawcalls.vcf.gz else - VCF=$(ls no_intervals*.vcf) - cp $VCF rawcalls.vcf - bgzip -@${cpus} rawcalls.vcf - tabix rawcalls.vcf.gz + VCF=$(ls no_intervals*.vcf) + cp $VCF rawcalls.vcf + bgzip -@${cpus} rawcalls.vcf + tabix rawcalls.vcf.gz fi set +u diff --git a/conf/genomes.config b/conf/genomes.config index edbbfd868c..9e74db8dde 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -79,5 +79,15 @@ params { 'custom' { fasta = null } + 'small_hg38' { + dbsnp = "${params.genomes_base}/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" + fasta = "${params.genomes_base}/data/genomics/homo_sapiens/genome/genome.fasta" + fasta_fai = "${params.genomes_base}/data/genomics/homo_sapiens/genome/genome.fasta.fai" + germline_resource = "${params.genomes_base}/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz" + known_indels = "${params.genomes_base}/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz" + snpeff_db = 'GRCh38.86' + species = 'homo_sapiens' + vep_cache_version = '99' + } } } diff --git a/tests/test_aligner.yml b/tests/test_aligner.yml index 60cd5d7717..437044bde4 100644 --- a/tests/test_aligner.yml +++ b/tests/test_aligner.yml @@ -1,30 +1,3 @@ -- name: Run bwa-mem - command: nextflow run main.nf -profile test,docker --aligner bwa-mem - tags: - - aligner - - bwa-mem - files: - - path: results/preprocessing/1234N/mapped/1234N.bam - - path: results/preprocessing/1234N/mapped/1234N.bam.bai - - path: results/preprocessing/1234N/markduplicates/1234N.md.bam - - path: results/preprocessing/1234N/markduplicates/1234N.md.bam.bai - - path: results/preprocessing/1234N/recal_table/1234N.recal.table - - path: results/preprocessing/1234N/recalibrated/1234N.recal.bam - - path: results/preprocessing/1234N/recalibrated/1234N.recal.bam.bai - - path: results/preprocessing/csv/markduplicates.csv - - path: results/preprocessing/csv/markduplicates_1234N.csv - - path: results/preprocessing/csv/markduplicates_no_table.csv - - path: results/preprocessing/csv/markduplicates_no_table_1234N.csv - - path: results/preprocessing/csv/recalibrated.csv - - path: results/preprocessing/csv/recalibrated_1234N.csv - - path: results/reports/fastqc/1234N-1234N_M1 - - path: results/reports/fastqc/1234N-1234N_M2 - - path: results/reports/fastqc/1234N-1234N_M4 - - path: results/reports/fastqc/1234N-1234N_M5 - - path: results/reports/fastqc/1234N-1234N_M6 - - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N - - path: results/reports/samtools_stats/1234N/1234N.bam.stats - name: Run bwa-mem2 command: nextflow run main.nf -profile test,docker --aligner bwa-mem2 tags: