Skip to content

Commit

Permalink
Merge pull request #17 from eastgenomics/DI-435_bash_improvements
Browse files Browse the repository at this point in the history
Di 435 bash improvements
  • Loading branch information
rklocke authored Nov 26, 2024
2 parents db6c162 + caf8833 commit fcf4410
Showing 1 changed file with 72 additions and 34 deletions.
106 changes: 72 additions & 34 deletions DI-435/merge_VCF_AF.sh
Original file line number Diff line number Diff line change
@@ -1,54 +1,86 @@
#!/bin/bash
set -exo pipefail
# Script for merging VCF files and adding AF tag
#
# Inputs:
# $1 -> input file with VCF files to merge
# $2 -> DNAnexus job ID for cloud workstation
# $3 -> Reference genome file
# $2 -> Reference genome file

input_file=$1
job=$2
genome=$3
project_file=$(awk -F ' ' '{print $3":"$4}' "$input_file")
to_download=("$project_file")

# Download all vcfs
for i in "${to_download[@]}"
do
echo "$i"
dx download "$i"
done
genome=$2

job="${DX_JOB_ID}"
num_proc="$(grep -c ^processor /proc/cpuinfo)"

# Check if the input file exists and is not empty
if [ ! -s "$input_file" ]; then
echo "ERROR: Input file '$input_file' is empty or does not exist!"
exit 1
fi

# Validate if the job ID is set and length is not zero
if [ -z "$job" ]; then
echo "ERROR: DNAnexus job ID is not set!"
exit 1
fi

# Validate genome file
if [ ! -f "$genome" ]; then
echo "ERROR: Reference genome file '$genome' does not exist!"
exit 1
fi

#blank array to store the project files
project_files=()

# Read the input file line by line and construct the project_file array
# keep track of line number for error reporting
line_number=0
while IFS=$'\t' read -r _ _ field3 field4 _; do
((line_number++))

# Validate fields are not empty
if [ -z "$field3" ] || [ -z "$field4" ]; then
echo "ERROR: Missing project or file ID at line $line_number"
exit 1
fi

# Validate field format (assuming they should match dx:// format)
if [[ ! "$field3" =~ ^project-.* ]] || [[ ! "$field4" =~ ^file-.* ]]; then
echo "ERROR: Invalid project or file ID format at line $line_number"
exit 1
fi

project_files+=("${field3}:${field4}")
done < "$input_file"

echo "Downloading files"
echo "${project_files[@]}" | tr ' ' '\n' | xargs -n 1 -P "${num_proc}" -I {} dx download --no-progress "{}"

# Index VCFs
echo "Indexing VCFs"
for vcf in ./*vcf.gz; do
bcftools index "$vcf";
done
echo *vcf.gz | tr ' ' '\n' | xargs -n 1 -P "${num_proc}" -I {} bcftools index "{}"

# Normalising VCFs
mkdir norm
echo "Normalising VCFs"
for vcf in ./*vcf.gz; do
bcftools norm -m -any -f "${genome}" -Oz "$vcf" > norm/"$vcf";
for vcf in *vcf.gz; do
bcftools norm -m -any -f "${genome}" -Oz "${vcf}" > "norm/${vcf}"
done

# Indexing normalised VCFs
echo "Indexing normalised VCFs"
cd norm || exit
for vcf in ./*vcf.gz; do
bcftools index -f "$vcf";
done
echo *vcf.gz | tr ' ' '\n' | xargs -n 1 -P "${num_proc}" -I {} bcftools index -f "{}"

# Merging normalised VCFs
echo "Merging normalised VCFs"
command="bcftools merge --output-type v -m none --missing-to-ref"
# Add the VCF files names to the command
for vcf in ./*vcf.gz; do
command="${command} $vcf";
echo "Collating normalised VCFs"
vcf_files=()
for vcf in *vcf.gz; do
vcf_files+=("$vcf")
done
command="${command} > ../merged.vcf"
echo "${command}"
eval "${command}"
echo "Merging collated VCFs into ../merged.vcf"
bcftools merge --output-type v -m none --missing-to-ref "${vcf_files[@]}" > ../merged.vcf

# Bgzip and index merged VCF file
echo "Bgzip and indexing merged file"
Expand All @@ -58,11 +90,17 @@ bcftools index merged.vcf.gz

# Final merging and processing
echo "Final processing"
command="bcftools norm -m -any -f ${genome} -Ou merged.vcf.gz"
command="${command} | bcftools +fill-tags --output-type v -o merge_tag.vcf -- -t AN,AC,NS,AF,MAF,AC_Hom,AC_Het,AC_Hemi"
command="${command} ; bcftools sort merge_tag.vcf -Oz > final_merged_${job}.vcf.gz"
command="${command} ; tabix -p vcf final_merged_${job}.vcf.gz"
eval "$command"

echo "Normalising merged VCF"
bcftools norm -m -any -f "${genome}" -Ou merged.vcf.gz \
| bcftools +fill-tags --output-type v -o merge_tag.vcf -- -t AN,AC,NS,AF,MAF,AC_Hom,AC_Het,AC_Hemi

echo "Sorting and compressing final VCF"
bcftools sort merge_tag.vcf -Oz -o "final_merged_${job}.vcf.gz"

echo "Indexing final VCF"
tabix -p vcf "final_merged_${job}.vcf.gz"

dx upload "final_merged_${job}.vcf.gz"
dx upload "final_merged_${job}.vcf.gz.tbi"
dx terminate "${job}"

0 comments on commit fcf4410

Please sign in to comment.