feat: code polishing

nf-core · Jun 8, 2021 · 68e1cc4 · 68e1cc4
1 parent 0dd07ea
commit 68e1cc4
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 59 deletions.
diff --git a/bin/concatenateVCFs.sh b/bin/concatenateVCFs.sh
@@ -8,27 +8,27 @@ usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension
 
 while [[ $# -gt 0 ]]
 do
-  key=$1
-  case $key in
+	key=$1
+	case $key in
 		-i)
 			genomeIndex=$2
 			shift # past argument
-	    shift # past value
+			shift # past value
 			;;
 		-c)
 			cpus=$2
 			shift # past argument
-	    shift # past value
+			shift # past value
 			;;
 		-o)
 			outputFile=$2
 			shift # past argument
-	    shift # past value
+			shift # past value
 			;;
 		-t)
 			targetBED=$2
 			shift # past argument
-	    shift # past value
+			shift # past value
 			;;
 		-n)
 			noInt=1
@@ -47,7 +47,7 @@ if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi
 
 
 if [ -z ${noInt+x} ]
-	then
+then
 	# First make a header from one of the VCF
 	# Remove interval information from the GATK command-line, but leave the rest
 	FIRSTVCF=$(set +o pipefail; ls *.vcf | head -n 1)
@@ -62,36 +62,36 @@ if [ -z ${noInt+x} ]
 
 	# Concatenate VCFs in the correct order
 	(
-	  cat header
+		cat header
 
-	  for chr in "${CONTIGS[@]}"; do
-	    # Skip if globbing would not match any file to avoid errors such as
-	    # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
-	    # was not processed.
-	    pattern="${chr}_*.vcf"
-	    if ! compgen -G "${pattern}" > /dev/null; then continue; fi
+		for chr in "${CONTIGS[@]}"; do
+			# Skip if globbing would not match any file to avoid errors such as
+			# "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
+			# was not processed.
+			pattern="${chr}_*.vcf"
+			if ! compgen -G "${pattern}" > /dev/null; then continue; fi
 
-	    # ls -v sorts by numeric value ("version"), which means that chr1_100_
-	    # is sorted *after* chr1_99_.
-	    for vcf in $(ls -v ${pattern}); do
-	      # Determine length of header.
-	      # The 'q' command makes sed exit when it sees the first non-header
-	      # line, which avoids reading in the entire file.
-	      L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l)
-	
-	      # Then print all non-header lines. Since tail is very fast (nearly as
-	      # fast as cat), this is way more efficient than using a single sed,
-	      # awk or grep command.
-	      tail -n +$((L+1)) ${vcf}
-	    done
-	  done
+			# ls -v sorts by numeric value ("version"), which means that chr1_100_
+			# is sorted *after* chr1_99_.
+			for vcf in $(ls -v ${pattern}); do
+				# Determine length of header.
+				# The 'q' command makes sed exit when it sees the first non-header
+				# line, which avoids reading in the entire file.
+				L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l)
+
+				# Then print all non-header lines. Since tail is very fast (nearly as
+				# fast as cat), this is way more efficient than using a single sed,
+				# awk or grep command.
+				tail -n +$((L+1)) ${vcf}
+			done
+		done
 	) | bgzip -@${cpus} > rawcalls.vcf.gz
 	tabix rawcalls.vcf.gz
 else
-        VCF=$(ls no_intervals*.vcf)
-        cp $VCF rawcalls.vcf
-		bgzip -@${cpus} rawcalls.vcf
-        tabix rawcalls.vcf.gz
+	VCF=$(ls no_intervals*.vcf)
+	cp $VCF rawcalls.vcf
+	bgzip -@${cpus} rawcalls.vcf
+	tabix rawcalls.vcf.gz
 fi
 
 set +u

diff --git a/conf/genomes.config b/conf/genomes.config
@@ -79,5 +79,15 @@ params {
     'custom' {
       fasta                   = null
     }
+    'small_hg38' {
+      dbsnp                   = "${params.genomes_base}/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz"
+      fasta                   = "${params.genomes_base}/data/genomics/homo_sapiens/genome/genome.fasta"
+      fasta_fai               = "${params.genomes_base}/data/genomics/homo_sapiens/genome/genome.fasta.fai"
+      germline_resource       = "${params.genomes_base}/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz"
+      known_indels            = "${params.genomes_base}/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz"
+      snpeff_db               = 'GRCh38.86'
+      species                 = 'homo_sapiens'
+      vep_cache_version       = '99'
+    }
   }
 }
diff --git a/tests/test_aligner.yml b/tests/test_aligner.yml
@@ -1,30 +1,3 @@
-- name: Run bwa-mem
-  command: nextflow run main.nf -profile test,docker --aligner bwa-mem
-  tags:
-    - aligner
-    - bwa-mem
-  files:
-    - path: results/preprocessing/1234N/mapped/1234N.bam
-    - path: results/preprocessing/1234N/mapped/1234N.bam.bai
-    - path: results/preprocessing/1234N/markduplicates/1234N.md.bam
-    - path: results/preprocessing/1234N/markduplicates/1234N.md.bam.bai
-    - path: results/preprocessing/1234N/recal_table/1234N.recal.table
-    - path: results/preprocessing/1234N/recalibrated/1234N.recal.bam
-    - path: results/preprocessing/1234N/recalibrated/1234N.recal.bam.bai
-    - path: results/preprocessing/csv/markduplicates.csv
-    - path: results/preprocessing/csv/markduplicates_1234N.csv
-    - path: results/preprocessing/csv/markduplicates_no_table.csv
-    - path: results/preprocessing/csv/markduplicates_no_table_1234N.csv
-    - path: results/preprocessing/csv/recalibrated.csv
-    - path: results/preprocessing/csv/recalibrated_1234N.csv
-    - path: results/reports/fastqc/1234N-1234N_M1
-    - path: results/reports/fastqc/1234N-1234N_M2
-    - path: results/reports/fastqc/1234N-1234N_M4
-    - path: results/reports/fastqc/1234N-1234N_M5
-    - path: results/reports/fastqc/1234N-1234N_M6
-    - path: results/reports/fastqc/1234N-1234N_M7
-    - path: results/reports/qualimap/1234N
-    - path: results/reports/samtools_stats/1234N/1234N.bam.stats
 - name: Run bwa-mem2
   command: nextflow run main.nf -profile test,docker --aligner bwa-mem2
   tags: