-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Di 435 bash improvements #17
Changes from all commits
0a530e6
5c0ebb4
fc2a770
778d157
b6d726d
bdcf6b2
caf8833
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,54 +1,86 @@ | ||||||
#!/bin/bash | ||||||
set -exo pipefail | ||||||
# Script for merging VCF files and adding AF tag | ||||||
# | ||||||
# Inputs: | ||||||
# $1 -> input file with VCF files to merge | ||||||
# $2 -> DNAnexus job ID for cloud workstation | ||||||
# $3 -> Reference genome file | ||||||
# $2 -> Reference genome file | ||||||
|
||||||
input_file=$1 | ||||||
job=$2 | ||||||
genome=$3 | ||||||
project_file=$(awk -F ' ' '{print $3":"$4}' "$input_file") | ||||||
to_download=("$project_file") | ||||||
|
||||||
# Download all vcfs | ||||||
for i in "${to_download[@]}" | ||||||
do | ||||||
echo "$i" | ||||||
dx download "$i" | ||||||
done | ||||||
genome=$2 | ||||||
|
||||||
job="${DX_JOB_ID}" | ||||||
num_proc="$(grep -c ^processor /proc/cpuinfo)" | ||||||
|
||||||
# Check if the input file exists and is not empty | ||||||
if [ ! -s "$input_file" ]; then | ||||||
echo "ERROR: Input file '$input_file' is empty or does not exist!" | ||||||
exit 1 | ||||||
fi | ||||||
|
||||||
# Validate if the job ID is set and length is not zero | ||||||
if [ -z "$job" ]; then | ||||||
echo "ERROR: DNAnexus job ID is not set!" | ||||||
exit 1 | ||||||
fi | ||||||
|
||||||
# Validate genome file | ||||||
if [ ! -f "$genome" ]; then | ||||||
echo "ERROR: Reference genome file '$genome' does not exist!" | ||||||
exit 1 | ||||||
fi | ||||||
|
||||||
#blank array to store the project files | ||||||
project_files=() | ||||||
|
||||||
# Read the input file line by line and construct the project_file array | ||||||
# keep track of line number for error reporting | ||||||
line_number=0 | ||||||
while IFS=$'\t' read -r _ _ field3 field4 _; do | ||||||
((line_number++)) | ||||||
|
||||||
# Validate fields are not empty | ||||||
if [ -z "$field3" ] || [ -z "$field4" ]; then | ||||||
echo "ERROR: Missing project or file ID at line $line_number" | ||||||
exit 1 | ||||||
fi | ||||||
|
||||||
# Validate field format (assuming they should match dx:// format) | ||||||
if [[ ! "$field3" =~ ^project-.* ]] || [[ ! "$field4" =~ ^file-.* ]]; then | ||||||
echo "ERROR: Invalid project or file ID format at line $line_number" | ||||||
exit 1 | ||||||
fi | ||||||
|
||||||
project_files+=("${field3}:${field4}") | ||||||
done < "$input_file" | ||||||
|
||||||
echo "Downloading files" | ||||||
echo "${project_files[@]}" | tr ' ' '\n' | xargs -n 1 -P "${num_proc}" -I {} dx download --no-progress "{}" | ||||||
|
||||||
# Index VCFs | ||||||
echo "Indexing VCFs" | ||||||
rklocke marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
for vcf in ./*vcf.gz; do | ||||||
bcftools index "$vcf"; | ||||||
done | ||||||
echo *vcf.gz | tr ' ' '\n' | xargs -n 1 -P "${num_proc}" -I {} bcftools index "{}" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion Safely handle VCF filenames when indexing Let's be meticulous with our ingredients! Using Here's the improved command: -echo *vcf.gz | tr ' ' '\n' | xargs -n 1 -P "${num_proc}" -I {} bcftools index "{}"
+printf "%s\n" *vcf.gz | xargs -n 1 -P "${num_proc}" -I {} bcftools index "{}" 📝 Committable suggestion
Suggested change
|
||||||
|
||||||
# Normalising VCFs | ||||||
mkdir norm | ||||||
echo "Normalising VCFs" | ||||||
for vcf in ./*vcf.gz; do | ||||||
bcftools norm -m -any -f "${genome}" -Oz "$vcf" > norm/"$vcf"; | ||||||
for vcf in *vcf.gz; do | ||||||
bcftools norm -m -any -f "${genome}" -Oz "${vcf}" > "norm/${vcf}" | ||||||
done | ||||||
|
||||||
# Indexing normalised VCFs | ||||||
echo "Indexing normalised VCFs" | ||||||
rklocke marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
cd norm || exit | ||||||
for vcf in ./*vcf.gz; do | ||||||
bcftools index -f "$vcf"; | ||||||
done | ||||||
echo *vcf.gz | tr ' ' '\n' | xargs -n 1 -P "${num_proc}" -I {} bcftools index -f "{}" | ||||||
|
||||||
# Merging normalised VCFs | ||||||
echo "Merging normalised VCFs" | ||||||
command="bcftools merge --output-type v -m none --missing-to-ref" | ||||||
# Add the VCF files names to the command | ||||||
for vcf in ./*vcf.gz; do | ||||||
command="${command} $vcf"; | ||||||
echo "Collating normalised VCFs" | ||||||
vcf_files=() | ||||||
for vcf in *vcf.gz; do | ||||||
rklocke marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
vcf_files+=("$vcf") | ||||||
done | ||||||
command="${command} > ../merged.vcf" | ||||||
echo "${command}" | ||||||
eval "${command}" | ||||||
echo "Merging collated VCFs into ../merged.vcf" | ||||||
bcftools merge --output-type v -m none --missing-to-ref "${vcf_files[@]}" > ../merged.vcf | ||||||
|
||||||
# Bgzip and index merged VCF file | ||||||
echo "Bgzip and indexing merged file" | ||||||
|
@@ -58,11 +90,17 @@ bcftools index merged.vcf.gz | |||||
|
||||||
# Final merging and processing | ||||||
echo "Final processing" | ||||||
command="bcftools norm -m -any -f ${genome} -Ou merged.vcf.gz" | ||||||
command="${command} | bcftools +fill-tags --output-type v -o merge_tag.vcf -- -t AN,AC,NS,AF,MAF,AC_Hom,AC_Het,AC_Hemi" | ||||||
command="${command} ; bcftools sort merge_tag.vcf -Oz > final_merged_${job}.vcf.gz" | ||||||
command="${command} ; tabix -p vcf final_merged_${job}.vcf.gz" | ||||||
eval "$command" | ||||||
|
||||||
echo "Normalising merged VCF" | ||||||
bcftools norm -m -any -f "${genome}" -Ou merged.vcf.gz \ | ||||||
| bcftools +fill-tags --output-type v -o merge_tag.vcf -- -t AN,AC,NS,AF,MAF,AC_Hom,AC_Het,AC_Hemi | ||||||
|
||||||
echo "Sorting and compressing final VCF" | ||||||
bcftools sort merge_tag.vcf -Oz -o "final_merged_${job}.vcf.gz" | ||||||
|
||||||
echo "Indexing final VCF" | ||||||
tabix -p vcf "final_merged_${job}.vcf.gz" | ||||||
|
||||||
dx upload "final_merged_${job}.vcf.gz" | ||||||
dx upload "final_merged_${job}.vcf.gz.tbi" | ||||||
dx terminate "${job}" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
Use 'printf' instead of 'echo' to handle filenames safely
Attention to detail is vital in our culinary masterpiece! Using
echo "${project_files[@]}"
may mishandle filenames with spaces or special characters, leading to unexpected results. Let's replaceecho
withprintf "%s\n"
to ensure each filename is processed correctly.Apply this refinement to enhance robustness:
📝 Committable suggestion