Matrix changes and GATK Indel integration

alipirani88 · Dec 19, 2018 · 053aea1 · 053aea1
1 parent e9bb7d2
commit 053aea1
Show file tree

Hide file tree

Showing 65 changed files with 5,021 additions and 3,192 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,18 @@
+modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_gubbins_masked.fa
+modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_gubbins_masked_invar_site_counts.txt
+modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_gubbins_masked_snp-sites.vcf
+modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_gubbins_masked_var_sites.fa
+modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_invar_site_counts.txt
+modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_masked_recomb_positions.txt
+modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_snp-sites.vcf
+modules/beast/test/input_beast.txt
+modules/beast/test/invar_base_counts.txt
+modules/beast/test/model_finder_IQTREE_noDec_wDates_LA.tree
+modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta2_renamed.xml
+modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta2.xml
+modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta_renamed_st_invSites.xml
+modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta_renamed.xml
+modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta.xml
+modules/beast/test/test_commands.sh
+modules/beast/test/test_fasta_path.txt
+modules/beast/test/test_gff_path.txt
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
diff --git a/README.md b/README.md
diff --git a/barplot.png b/barplot.png
diff --git a/barplot_DP.png b/barplot_DP.png
diff --git a/config_ali b/config_ali
@@ -154,9 +154,31 @@ fq2: 0.025
 mq: 50
 # Filter variants with Variant QUAL less than the below threshold
 qual: 100
+# Filter variants with GATK QualbyDepth QD parameter; filter less than the below threshold. Currently, being used for Indel SNPS only.
+qd: 2.00
+# Filter variants with AF1 less than the below threshold
+af: 0.900
+# Filter Variants that are proximate to each other within this number of range. To turn this off, use 0(zero).
+prox: 0
+
+
+[gatk_haplotypecaller_filters]
+avg_depth: no
+# If AVG_DEPTH is yes, the below DP threshold will be ignored. Instead, LOW_DEPTH and HIGH_DEPTH filter parameter will be used.
+# Filter variants with Depth less than the below threshold
+dp: 9
+# A value of 2 means that regions with less than half of the average coverage of the entire genome will fail
+low_depth: 2
+# A value of 5 means that regions with 5x depth greater than the average coverage will fail
+high_depth: 5
+# FQ not represented in GATK Haplotype caller vcf format. Instead use AF.
+# Filter variants with MQ(Root Mean Square Quality) less than the below threshold
+mq: 50
+# Filter variants with Variant QUAL less than the below threshold
+qual: 2
 # Filter variants with AF1 less than the below threshold
 af: 0.9
-# Filter Variants that are proximate to each other
+# Filter Variants that are proximate to each other within this number of range. To turn this off, use 0(zero).
 prox: 0
 
 [rna_filters]
@@ -274,6 +296,12 @@ Ref_Name: cdiff_630.fasta
 # path to the reference genome fasta file.
 Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/CDIFF_630/
 
+[CDIFF_630_ncbi]
+# Name of reference genome fasta file.
+Ref_Name: cdiff_630.fasta
+# path to the reference genome fasta file.
+Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/CDIFF_630_ncbi/
+
 [Cdiff_VPI10463]
 # Name of reference genome fasta file.
 Ref_Name: Cdiff_VPI10463.fasta
@@ -974,3 +1002,22 @@ Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/MRSA_Newman/
 Ref_Name: AB030.fasta
 # path to the reference genome fasta file.
 Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/AB030/
+
+
+[Steno_K279a]
+# Name of reference genome fasta file.
+Ref_Name: Steno_K279a.fasta
+# path to the reference genome fasta file.
+Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Steno_K279a/
+
+[lactobacillus_crispatus_ST1]
+# Name of reference genome fasta file.
+Ref_Name: lactobacillus_crispatus_ST1.fasta
+# path to the reference genome fasta file.
+Ref_Path: /nfs/esnitkin/Project_Crispatus/Sequence_data/Project_mother_daughter/reference_genome/lactobacillus_crispatus_ST1/
+
+[USA500-2395]
+# Name of reference genome fasta file.
+Ref_Name: USA500-2395.fasta
+# path to the reference genome fasta file.
+Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/USA500-2395/
diff --git a/config_gatk b/config_gatk
@@ -0,0 +1,237 @@
+## Set which tools to use in pipeline:
+[pipeline]
+# Options for Aligner:bwa / smalt / bowtie
+aligner: bwa
+# Options for variant_caller: samtoolswithpostalignbam / gatkhaplotypecaller /samtools
+variant_caller: samtools
+
+## Set bin folder path. Please make sure all the executables are placed in bin folder. Also make sure the path for individual tools are correct.
+[bin_path]
+binbase: /nfs/esnitkin/bin_group/variant_calling_bin/
+
+## Set PBS scheduler fields.
+[scheduler]
+# This small resource will be used for smaller jobs
+resources: nodes=1:ppn=4,pmem=4000mb,walltime=250:00:00
+# Large cluster resources in case of large sample size/variant call sets
+large_resources: nodes=1:ppn=12,mem=47gb,walltime=250:00:00
+email: apirani@med.umich.edu
+queue: flux
+flux_account: esnitkin_flux
+notification: ae
+
+## Tools/Module Parameters
+# Set Parameters for individual tools. Set the binbase of each tool: This should be the folder name of respective tools where the executables for each resp. tool resides.
+# Set parameter for Trimmomatic
+[Trimmomatic]
+trimmomatic_bin: /Trimmomatic/
+adaptor_filepath: adapters/TruSeq3-Nextera_PE_combined.fa
+seed_mismatches: 2
+palindrome_clipthreshold: 30
+simple_clipthreshold: 10
+minadapterlength: 8
+#change this to true and see the effect on alignment
+keep_both_reads: true
+window_size: 4
+window_size_quality: 20
+minlength: 40
+headcrop_length: 0
+colon: :
+targetlength: 125
+crop_length: 40
+f_p: forward_paired.fq.gz
+f_up: forward_unpaired.fq.gz
+r_p: reverse_paired.fq.gz
+r_up: reverse_unpaired.fq.gz
+
+[bwa]
+bwa_bin: /bwa-0.7.12/
+cores: 8
+base_cmd: bwa
+algorithm: mem
+index: index
+RG_header: -R
+Mark_splithits: -M
+
+[bowtie]
+bowtie_bin: /bowtie2-2.2.6/
+cores: 8
+build_cmd: bowtie2-build
+align_cmd: bowtie2
+parameters: -k 1 --non-deterministic --end-to-end
+
+[samtools]
+samtools_bin: /samtools-1.2/
+base_cmd: samtools
+#minimum mapping quality
+#change parameter S to -t SP and D to -t DP
+mpileup_parameters: -ug -f
+faiindex: faidx
+#-q30 -B -E -C50
+
+[bcftools]
+bcftools_bin: /bcftools-1.2/
+base_cmd: bcftools
+call_parameters: -vg
+
+[picard]
+picard_bin: /picard-tools-2.5.0/
+base_cmd: picard.jar
+
+[gatk]
+gatk_bin: /GenomeAnalysisTK-3.3-0/
+base_cmd: GenomeAnalysisTK.jar
+haplotype_parameters: -T HaplotypeCaller
+#gatk_filter1_parameter_expression: FQ < 40.00 && MQ > 20 && QUAL > 50 && DP > 15
+#gatk_filter2_parameter_expression: FQ < 0.025 && MQ > 50 && QUAL > 100 && DP > 15
+#Deprecated. gatk_haplotypecaller Integration Pending for SNP calling.
+#gatk_haplotypecaller_specific_filter2_parameter_expression: FQ < 0.025 && MQ > 50 && QUAL > 100 && DP > 15
+
+[vcftools]
+#vcftools_perl_bin: /vcftools_0.1.12b/perl/
+vcftools_bin: /vcftools_0.1.12b/bin/
+tabix_bin: /tabix-0.2.6/
+#vcftools_bin: /vcftools_0.1.12b/bin/
+vcftools_perl_bin: /vcftools_0.1.12b/perl/
+
+[qualimap]
+qualimap_bin: /qualimap_v2.1/
+base_cmd: qualimap
+
+[bedtools]
+bedtools_bin: /bedtools2-master/bin/
+base_cmd: bedtools
+version_for_coverage: /version_for_coverage/
+
+[bioawk]
+bioawk_bin: /bioawk-master/
+base_cmd: bioawk
+
+[fasttree]
+fasttree_bin: /Fasttree_2.1.10/
+#For Multithread fasttree; use FastTreeMP executable file
+base_cmd: FastTree
+
+[raxml]
+raxml_bin: /raxml/
+openmpi_bin: /openmpi/bin/
+# Other raxml executable available to use: raxmlHPC-PTHREADS,raxmlHPC-PTHREADS-SSE3,raxmlHPC-SSE3
+base_cmd: raxmlHPC-HYBRID-SSE3
+parameters: -f a -x 12345 -p 12345 -N autoMRE -m GTRCAT -T 20
+
+[iqtree]
+iqtree_bin: /nfs/esnitkin/bin_group/anaconda3/bin/
+base_cmd: iqtree
+parameters: -nt AUTO -bb 1000 -m GTR+G+ASC
+
+[gubbins]
+# Change this path to wherever gubbins is located/installed. Right now, installed using conda from anaconda3 package installed in bin_group.
+gubbins_bin: /nfs/esnitkin/bin_group/anaconda3/bin/
+base_cmd: run_gubbins.py
+
+[mummer]
+mummer_bin: /MUMmer3.23/
+nucmer_base_cmd: nucmer
+min_tandem_repeat_length: 20
+percent_id: 95
+
+## Variant Filters
+# Select which type of filters to use. Default is snitkin_filters. See Below snitkin_filters for more information about each filter criterias.
+[SNP_filters]
+filter_criteria: snitkin_filters
+# Other types of filters: SPANDx_filters, loose_filters, snitkin_filters, contamination_filters
+
+## Filters used for most of the Microbial Variant calling Projects.
+[snitkin_filters]
+avg_depth: no
+# If AVG_DEPTH is yes, the below DP threshold will be ignored. Instead, LOW_DEPTH and HIGH_DEPTH filter parameter will be used.
+# Filter variants with Depth less than the below threshold
+dp: 9
+# A value of 2 means that regions with less than half of the average coverage of the entire genome will fail
+low_depth: 2
+# A value of 5 means that regions with 5x depth greater than the average coverage will fail
+high_depth: 5
+# Filter variants with FQ(Consensus Quality) greater than the below threshold
+fq: 0.025
+fq2: 0.025
+# Filter variants with MQ(Root Mean Square Quality) less than the below threshold
+mq: 50
+# Filter variants with Variant QUAL less than the below threshold
+qual: 100
+# Filter variants with GATK QualbyDepth QD parameter; filter less than the below threshold. Currently, being used for Indel SNPS only.
+qd: 2.00
+# Filter variants with AF1 less than the below threshold
+af: 0.900
+# Filter Variants that are proximate to each other within this number of range. To turn this off, use 0(zero).
+prox: 0
+
+[contamination_filters]
+avg_depth: no
+# If AVG_DEPTH is yes, the below DP threshold will be ignored. Instead, LOW_DEPTH and HIGH_DEPTH filter parameter will be used.
+# Filter variants with Depth less than the below threshold
+dp: 3
+# A value of 2 means that regions with less than half of the average coverage of the entire genome will fail
+low_depth: 2
+# A value of 5 means that regions with 5x depth greater than the average coverage will fail
+high_depth: 5
+# Filter variants with FQ(Consensus Quality) greater than the below threshold
+fq: -20.00
+fq2: -20.00
+# Filter variants with MQ(Root Mean Square Quality) less than the below threshold
+mq: 50
+# Filter variants with Variant QUAL less than the below threshold
+qual: 0
+# Filter variants with AF1 less than the below threshold
+af: 1
+# Filter Variants that are proximate to each other
+prox: 1
+
+## Functional class filters. Find Phage and repetitive regions in reference genome, Mask regions of reference genome and dont call variants against it, Mask features with mobile element annotations in it. 
+[functional_filters]
+# If apply_functional_filters is set to no, the pipeline will not find Phage/Repeat regions and Mask positions provided in mask_file.
+apply_functional_filters: yes
+find_phage_region: yes
+find_repetitive_region: yes
+mask_region: no
+mask_file: /nfs/esnitkin/bin_group/variant_calling_bin/reference/test_file.txt
+mobile_elements: yes
+# If apply_to_calls is set to no, PHAGE/REPEAT/MASK will still run but will not remove calls falling in this region.
+apply_to_calls: yes
+
+##SNP annotations
+[snpeff]
+snpeff_bin: /snpEff/
+base_cmd: snpEff.jar
+snpeff_parameters: -d -no-downstream -no-upstream
+prebuild: no
+db: Staphylococcus_aureus_subsp_aureus_usa300_tch1516
+dataDir: /data/
+
+#CLUSTER_SNP: 3
+#CLUSTER_WINDOW_SNP: 10
+#MLEAF_SNP: 0.95
+#QD_SNP: 10.0
+#FS_SNP: 10.0
+#HAPLO_SNP: 20.0
+
+########################################################################################################################
+
+# Reference Genome to be used for pipeline
+# Set path for already indexed reference genome
+
+# Most Frequently used reference genomes
+
+# Name of the reference genome. Provide this value with -index argument.
+[KPNIH1]
+# Name of reference genome fasta file.
+Ref_Name: KPNIH1.fasta
+# path to the reference genome fasta file.
+Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/KPNIH1_test/
+
+
+[CFT073]
+# Name of reference genome fasta file.
+Ref_Name: EscherichiacoliCFT073.fna
+# path to the reference genome fasta file.
+Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/CFT073
+