Merge pull request #17 from eastgenomics/DI-435_bash_improvements

Di 435 bash improvements
eastgenomics · Nov 26, 2024 · fcf4410 · fcf4410
2 parents db6c162 + caf8833
commit fcf4410
Showing 1 changed file with 72 additions and 34 deletions.
diff --git a/DI-435/merge_VCF_AF.sh b/DI-435/merge_VCF_AF.sh
@@ -1,54 +1,86 @@
 #!/bin/bash
+set -exo pipefail
 # Script for merging VCF files and adding AF tag
 #
 # Inputs:
 #   $1 -> input file with VCF files to merge
-#   $2 -> DNAnexus job ID for cloud workstation
-#   $3 -> Reference genome file
+#   $2 -> Reference genome file
 
 input_file=$1
-job=$2
-genome=$3
-project_file=$(awk -F ' ' '{print $3":"$4}' "$input_file")
-to_download=("$project_file")
-
-# Download all vcfs
-for i in "${to_download[@]}"
-    do
-        echo "$i"
-        dx download "$i"
-    done
+genome=$2
+
+job="${DX_JOB_ID}"
+num_proc="$(grep -c ^processor /proc/cpuinfo)"
+
+# Check if the input file exists and is not empty
+if [ ! -s "$input_file" ]; then
+    echo "ERROR: Input file '$input_file' is empty or does not exist!"
+    exit 1
+fi
+
+# Validate if the job ID is set and length is not zero
+if [ -z "$job" ]; then
+    echo "ERROR: DNAnexus job ID is not set!"
+    exit 1
+fi
+
+# Validate genome file
+if [ ! -f "$genome" ]; then
+    echo "ERROR: Reference genome file '$genome' does not exist!"
+    exit 1
+fi
+
+#blank array to store the project files
+project_files=()
+
+# Read the input file line by line and construct the project_file array
+# keep track of line number for error reporting
+line_number=0
+while IFS=$'\t' read -r _ _ field3 field4 _; do
+        ((line_number++))
+
+        # Validate fields are not empty
+        if [ -z "$field3" ] || [ -z "$field4" ]; then
+            echo "ERROR: Missing project or file ID at line $line_number"
+            exit 1
+        fi
+
+        # Validate field format (assuming they should match dx:// format)
+        if [[ ! "$field3" =~ ^project-.* ]] || [[ ! "$field4" =~ ^file-.* ]]; then
+            echo "ERROR: Invalid project or file ID format at line $line_number"
+            exit 1
+        fi
+
+        project_files+=("${field3}:${field4}")
+done < "$input_file"
+
+echo "Downloading files"
+echo "${project_files[@]}" | tr ' ' '\n' | xargs -n 1 -P "${num_proc}" -I {} dx download --no-progress "{}"
 
 # Index VCFs
 echo "Indexing VCFs"
-for vcf in ./*vcf.gz; do
-    bcftools index "$vcf";
-done
+echo *vcf.gz | tr ' ' '\n' | xargs -n 1 -P "${num_proc}" -I {} bcftools index "{}"
 
 # Normalising VCFs
 mkdir norm
 echo "Normalising VCFs"
-for vcf in ./*vcf.gz; do
-    bcftools norm -m -any -f "${genome}" -Oz "$vcf" > norm/"$vcf";
+for vcf in *vcf.gz; do
+    bcftools norm -m -any -f "${genome}" -Oz "${vcf}" > "norm/${vcf}"
 done
 
 # Indexing normalised VCFs
 echo "Indexing normalised VCFs"
 cd norm || exit
-for vcf in ./*vcf.gz; do
-    bcftools index -f "$vcf";
-done
+echo *vcf.gz | tr ' ' '\n' | xargs -n 1 -P "${num_proc}" -I {} bcftools index -f "{}"
 
 # Merging normalised VCFs
-echo "Merging normalised VCFs"
-command="bcftools merge --output-type v -m none --missing-to-ref"
-# Add the VCF files names to the command
-for vcf in ./*vcf.gz; do
-    command="${command} $vcf";
+echo "Collating normalised VCFs"
+vcf_files=()
+for vcf in *vcf.gz; do
+    vcf_files+=("$vcf")
 done
-command="${command} > ../merged.vcf"
-echo "${command}"
-eval "${command}"
+echo "Merging collated VCFs into ../merged.vcf"
+bcftools merge --output-type v -m none --missing-to-ref "${vcf_files[@]}" > ../merged.vcf
 
 # Bgzip and index merged VCF file
 echo "Bgzip and indexing merged file"
@@ -58,11 +90,17 @@ bcftools index merged.vcf.gz
 
 # Final merging and processing
 echo "Final processing"
-command="bcftools norm -m -any -f ${genome} -Ou merged.vcf.gz"
-command="${command} | bcftools +fill-tags --output-type v -o merge_tag.vcf -- -t AN,AC,NS,AF,MAF,AC_Hom,AC_Het,AC_Hemi"
-command="${command} ; bcftools sort merge_tag.vcf -Oz > final_merged_${job}.vcf.gz"
-command="${command} ; tabix -p vcf final_merged_${job}.vcf.gz"
-eval "$command"
+
+echo "Normalising merged VCF"
+bcftools norm -m -any -f "${genome}" -Ou merged.vcf.gz \
+    | bcftools +fill-tags --output-type v -o merge_tag.vcf -- -t AN,AC,NS,AF,MAF,AC_Hom,AC_Het,AC_Hemi
+
+echo "Sorting and compressing final VCF"
+bcftools sort merge_tag.vcf -Oz -o "final_merged_${job}.vcf.gz"
+
+echo "Indexing final VCF"
+tabix -p vcf "final_merged_${job}.vcf.gz"
+
 dx upload "final_merged_${job}.vcf.gz"
 dx upload "final_merged_${job}.vcf.gz.tbi"
 dx terminate "${job}"