Matrix changes and GATK Indel integration

alipirani88 · Dec 19, 2018 · a72aa46 · a72aa46
2 parents 053aea1 + 7b2c92a
commit a72aa46
Show file tree

Hide file tree

Showing 9 changed files with 234 additions and 2 deletions.
diff --git a/config_ali b/config_ali
@@ -14,8 +14,8 @@ binbase: /nfs/esnitkin/bin_group/variant_calling_bin/
 resources: nodes=1:ppn=4,pmem=4000mb,walltime=250:00:00
 large_resources: nodes=1:ppn=12,mem=47gb,walltime=250:00:00
 email: apirani@med.umich.edu
-queue: flux
-flux_account: esnitkin_flux
+queue: fluxod
+flux_account: esnitkin_fluxod
 notification: ae
 
 # Set Parameters for individual tools. Set the binbase of each tool: This should be the folder name of respective tools where the executables for each resp. tool resides.
@@ -1009,6 +1009,7 @@ Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/AB030/
 Ref_Name: Steno_K279a.fasta
 # path to the reference genome fasta file.
 Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Steno_K279a/
+<<<<<<< HEAD
 
 [lactobacillus_crispatus_ST1]
 # Name of reference genome fasta file.
@@ -1021,3 +1022,5 @@ Ref_Path: /nfs/esnitkin/Project_Crispatus/Sequence_data/Project_mother_daughter/
 Ref_Name: USA500-2395.fasta
 # path to the reference genome fasta file.
 Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/USA500-2395/
+=======
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
diff --git a/modules/beast/beast.py b/modules/beast/beast.py
@@ -6,7 +6,10 @@
 import argparse
 from subprocess import call
 import re
+<<<<<<< HEAD
 from fasta_functions import *
+=======
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
 
 # Get command line arguments
 parser = argparse.ArgumentParser(description='Run BEAST pipeline.')
@@ -27,17 +30,43 @@
 tree = args.tree
 scripts_dir = args.scripts_dir
 
+<<<<<<< HEAD
+=======
+# Set paths
+python = '/nfs/esnitkin/bin_group/anaconda3/bin/python'
+mask_vars_path = '/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/mask_gubbins_variants.py'
+
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
 # Get invariant site counts
 if invar_sites is not None:
     if not invar_sites.endswith(tuple(['fa','fasta'])):
         # Path to file with invariant site counts
         invar_counts_file = invar_sites
     else:
+<<<<<<< HEAD
         # Count invariant sites in whole genome alignment
         invar_counts_file = count_invar_sites(invar_sites, gubbins_gff)
     with open(invar_counts_file) as f:
         invar_counts = f.readline().strip()
 
+=======
+        # Mask recombinant regions before counting invariant sites
+        if gubbins_gff is not None:
+            call([python, mask_vars_path, invar_sites, gubbins_gff])
+            aln_file = re.split('/|\.', invar_sites)[-2] + \
+                      '_gubbins_masked.fa'
+        else:
+            aln_file = invar_sites
+        # Count invariant sites in whole genome alignment
+        call([python, scripts_dir + '/get_num_invariant_sites_v2.py', aln_file])
+        invar_counts_file = re.split('/|\.', aln_file)[-2] + \
+                          '_invar_site_counts.txt'
+    # Read file with invariant site counts
+    with open(invar_counts_file) as f:
+        invar_counts = f.readline().strip()
+
+
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
 # Final xmls for beast
 xmls_for_beast = []
 

diff --git a/modules/beast/beast_pbs_flux.pl b/modules/beast/beast_pbs_flux.pl
@@ -3,7 +3,10 @@
 #Takes as input:
 #		1) config_file	- A 1-column file where each line is the file name (minus the .xml extension) for a beast run
 # UPDATED 2017-09-21 from beast2/2.4.5 to beast2/2.4.7
+<<<<<<< HEAD
 # UPDATED 2018-11-19 from beast2/2.4.7 to /nfs/esnitkin/bin_group/beast_v2.5.1/beast/bin/beast
+=======
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
 
 #PRINT USAGE
 if(@ARGV == 0)
@@ -75,13 +78,23 @@
 	print PBS "cd $out_dir";
 	print PBS "\n";
 	print PBS "#Load modules\n";
+<<<<<<< HEAD
 	#print PBS "module load beast2/2.4.7\n";
     print PBS "module load beagle\n";
     #print PBS "addonmanager -add BEAST_CLASSIC\n";
     #print PBS "addonmanager -add SCOTTI\n";
 	print PBS "\n";
 	print PBS "#  Put your job commands here:\n";
 	print PBS "/nfs/esnitkin/bin_group/beast_v2.5.1/beast/bin/beast -beagle_SSE -instances 12 -threads 12 beast$count\_$fnameorig\n";
+=======
+	print PBS "module load beast2/2.4.7\n";
+        print PBS "module load beagle\n";
+        #print PBS "addonmanager -add BEAST_CLASSIC\n";
+        #print PBS "addonmanager -add SCOTTI\n";
+	print PBS "\n";
+	print PBS "#  Put your job commands here:\n";
+	print PBS "beast -beagle_SSE -instances 12 -threads 12 beast$count\_$fnameorig\n";
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
 	print PBS "\n";
 
 	close PBS;

diff --git a/modules/beast/fasta_functions.py b/modules/beast/fasta_functions.py
@@ -17,16 +17,25 @@
 import re
 import os
 
+<<<<<<< HEAD
 def subset_fasta(ids, fastain, fastaout):
     """Creates a new fasta file with a subset of original sequences.
 
     Args:
         idnames: List of sequence names to be extracted.
+=======
+def subset_fasta(idnames, fastain, fastaout):
+    """Creates a new fasta file with a subset of original sequences.
+
+    Args:
+        idnames: Text file with list of names of sequences to be extracted.
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
         fastain: Path to fasta file containing all sequences of interest.
         fastaout: Name of output fasta file containing only the sequences of interest.
 
     Output:
         Fasta file containing only the sequences of interest.
+<<<<<<< HEAD
     """    
 
     with open(fastaout, 'w') as f:
@@ -62,6 +71,28 @@ def get_fasta_subsets(csv, fasta):
             fas.append(fa)
 
     return(fas)
+=======
+
+    Returns:
+        Path to subsetted fasta file.
+    """    
+
+    ids = []
+
+    for line in open(idnames, 'r'):
+        if line.strip() == 'x':
+            pass
+        else:
+            ids.append(line.strip())
+
+    with open(fastaout, 'w') as f:
+        for rec in SeqIO.parse(fastain, 'fasta'):
+            if str(rec.id) in ids:
+                f.write('>' + names[ids.index(rec.id)] + '\n')
+                f.write(str(rec.seq) + '\n')
+
+    return(fastaout)
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
 
 
 def rm_invar_sites(fasta, outfile=None, outfmt=['fasta','vcf'], outdir='.', 
@@ -82,7 +113,11 @@ def rm_invar_sites(fasta, outfile=None, outfmt=['fasta','vcf'], outdir='.',
         Path to output file.
     """
 
+<<<<<<< HEAD
     if len(outfmt) > 1:
+=======
+   if len(outfmt) > 1:
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
        outfmt = outfmt[0]
 
     # Get output fasta file name
@@ -237,13 +272,18 @@ def count_invar_sites(fasta,gff=None,outdir='.',path={'snpsites':'/nfs/esnitkin/
             if not li.startswith("#"):
                 positions.append(line.split('\t')[1])
 
+<<<<<<< HEAD
     # Get allele for invariant sites
     invar_sites = []
+=======
+    # Get ref allele for invariant sites
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
     for record in aln:
         seq_str = list(str(record.seq))
         for index in positions:
             index = int(index)
             seq_str[index] = ''
+<<<<<<< HEAD
         #tmp = ''.join(seq_str)
         if len(invar_sites) == 0:
             invar_sites = seq_str
@@ -260,16 +300,29 @@ def count_invar_sites(fasta,gff=None,outdir='.',path={'snpsites':'/nfs/esnitkin/
     # Get invariant site count for each base
     #print('Counting bases.')
     #invar_counts = Counter(invar_sites)
+=======
+        invar_sites = ''.join(seq_str)
+        break
+
+    # Get invariant site count for each base
+    print('Counting bases.')
+    invar_counts = Counter(invar_sites)
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
 
     # Write base counts to files
     invar_counts_file = outdir + '/' + re.split('/|\.', aln_file)[-2] + \
                           '_invar_site_counts.txt'
     print('Writing ', invar_counts_file, ' (order: A C G T).', sep='')
     with open(invar_counts_file, 'w') as f:
         for base, count in sorted(invar_counts.items()):
+<<<<<<< HEAD
             #if base in ['A','a','C','c','G','g','T','t']:
             print(base + ' ' + str(count))
             f.write('%s ' % (count))
             #else:
             #    print(base + ' ' + str(count))
+=======
+            f.write('%s ' % (count))
+
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
     return(invar_counts_file)
diff --git a/modules/beast/scotti.py b/modules/beast/scotti.py
@@ -1,11 +1,16 @@
+<<<<<<< HEAD
 # PIPELINE TO RUN BEAST ON FLUX
+=======
+# PIPELINE TO RUN SCOTTI ON FLUX
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
 
 # Import modules
 import sys
 import os
 import argparse
 from subprocess import call
 import re
+<<<<<<< HEAD
 from fasta_functions import *
 from scotti_functions import *
 
@@ -106,3 +111,28 @@
 print('Generating PBS script(s) and submitting jobs to flux.')
 call(['perl', scripts_dir + '/beast_pbs_flux.pl', 'input_beast.txt'])
 
+=======
+
+# Get command line arguments
+parser = argparse.ArgumentParser(description='Generate SCOTTI XMLs and run SCOTTI.')
+parser.add_argument('xmls', nargs='*', help='input SCOTTI BEAST xml file(s)')
+parser.add_argument('--nReps', '-n', metavar='N', type=int, help='number of beast runs for each xml', default=3)
+parser.add_argument('--invarSites', '-i', metavar='FILE', help='''
+                    option 1: fasta file to  get number of invariant As, Cs, Gs, and Ts not in alignment
+                    option 2: text file with number of invariant As, Cs, Gs, and Ts in alignment''')
+parser.add_argument('--gubbinsGFF', '-g', metavar='GFF', help='''gubbins output GFF file to mask recombinant sites in alignment before counting invariant sites''')
+parser.add_argument('--tree', '-t', help='starting tree file in newick format')
+parser.add_argument('-scriptsDir', '-d', metavar='DIR', help='directory where all of the scripts are housed', default='/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/beast')
+args = parser.parse_args()
+# Rename variables
+xmls = args.xmls
+nReps = args.nReps
+invarSites = args.invarSites
+gubbinsGFF = args.gubbinsGFF
+tree = args.tree
+scriptsDir = args.scriptsDir
+
+# Set paths
+python = '/nfs/esnitkin/bin_group/anaconda3/bin/python'
+maskVarsPath = '/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/mask_gubbins_variants.py'
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
diff --git a/modules/stages.py b/modules/stages.py
@@ -139,11 +139,40 @@ def qualimap(out_sorted_bam, out_path, analysis, logger, Config):
 """ Variant Filteration """
 def filter_variants(final_raw_vcf, out_path, analysis, ref_index, logger, Config, Avg_dp):
     reference = ConfigSectionMap(ref_index, Config)['ref_path'] + "/" + ConfigSectionMap(ref_index, Config)['ref_name']
+<<<<<<< HEAD
     gatk_filter_final_vcf_file = gatk_filter(final_raw_vcf, out_path, analysis, reference, logger, Config, Avg_dp)
     #gatk_filter_final_vcf_contamination_file = gatk_filter_contamination(final_raw_vcf, out_path, analysis, reference, logger, Config, Avg_dp)
     gatk_filter_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter_final_vcf_file, out_path, analysis, reference, logger, Config)
     keep_logging('The vcf file with no proximate snp: {}'.format(gatk_filter_final_vcf_file_no_proximate_snp), 'The vcf file with no proximate snp: {}'.format(gatk_filter_final_vcf_file_no_proximate_snp), logger, 'debug')
 
+=======
+    gatk_filter2_final_vcf_file = gatk_filter2(final_raw_vcf, out_path, analysis, reference, logger, Config, Avg_dp)
+    gatk_filter2_final_vcf_contamination_file = gatk_filter2_contamination(final_raw_vcf, out_path, analysis, reference, logger, Config, Avg_dp)
+    gatk_filter2_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter2_final_vcf_file, out_path, analysis, reference, logger, Config)
+    keep_logging('The vcf file with no proximate snp: {}'.format(gatk_filter2_final_vcf_file_no_proximate_snp), 'The vcf file with no proximate snp: {}'.format(gatk_filter2_final_vcf_file_no_proximate_snp), logger, 'debug')
+    #gatk_vcf2fasta_filter2_file = gatk_vcf2fasta_filter2(gatk_filter2_final_vcf_file, out_path, analysis, reference, logger, Config)
+    #gatk_vcf2fasta_filter2_file_no_proximate = gatk_vcf2fasta_filter2(gatk_filter2_final_vcf_file_no_proximate_snp, out_path, analysis, reference, logger, Config)
+    #vcftools_vcf2fasta_filter2_file = vcftools_vcf2fasta_filter2(gatk_filter2_final_vcf_file, out_path, analysis, reference, logger, Config)
+    #vcftools_vcf2fasta_filter2_file_no_proximate = vcftools_vcf2fasta_filter2(gatk_filter2_final_vcf_file_no_proximate_snp, out_path, analysis, reference, logger, Config)
+    #keep_logging('The final Consensus Fasta file: {}'.format(gatk_vcf2fasta_filter2_file), 'The final Consensus Fasta file: {}'.format(gatk_vcf2fasta_filter2_file), logger, 'debug')
+    #keep_logging('The final Consensus Fasta file with no proximate: {}'.format(gatk_vcf2fasta_filter2_file_no_proximate), 'The final Consensus Fasta file with no proximate: {}'.format(gatk_vcf2fasta_filter2_file_no_proximate), logger, 'debug')
+    #keep_logging('The final Consensus Fasta file from VCF-consensus: {}'.format(vcftools_vcf2fasta_filter2_file), 'The final Consensus Fasta file from VCF-consensus: {}'.format(vcftools_vcf2fasta_filter2_file), logger, 'debug')
+    #keep_logging('The final Consensus Fasta file from VCF-consensus with no proximate: {}'.format(vcftools_vcf2fasta_filter2_file_no_proximate), 'The final Consensus Fasta file from VCF-consensus with no proximate: {}'.format(vcftools_vcf2fasta_filter2_file_no_proximate), logger, 'debug')
+
+def filter2_indels(final_raw_indel_vcf, out_path, analysis, ref_index, logger, Config, Avg_dp):
+    reference = ConfigSectionMap(ref_index, Config)['ref_path'] + "/" + ConfigSectionMap(ref_index, Config)['ref_name']
+    gatk_filter2_final_vcf_file = gatk_filter2_indel(final_raw_indel_vcf, out_path, analysis, reference, logger, Config, Avg_dp)
+    # gatk_filter2_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter2_final_vcf_file, out_path, analysis, reference, logger, Config)
+    # keep_logging('The vcf file with no proximate snp: {}'.format(gatk_filter2_final_vcf_file_no_proximate_snp), 'The vcf file with no proximate snp: {}'.format(gatk_filter2_final_vcf_file_no_proximate_snp), logger, 'debug')
+    # gatk_vcf2fasta_filter2_file = gatk_vcf2fasta_filter2(gatk_filter2_final_vcf_file, out_path, analysis, reference, logger, Config)
+    # gatk_vcf2fasta_filter2_file_no_proximate = gatk_vcf2fasta_filter2(gatk_filter2_final_vcf_file_no_proximate_snp, out_path, analysis, reference, logger, Config)
+    # vcftools_vcf2fasta_filter2_file = vcftools_vcf2fasta_filter2(gatk_filter2_final_vcf_file, out_path, analysis, reference, logger, Config)
+    # vcftools_vcf2fasta_filter2_file_no_proximate = vcftools_vcf2fasta_filter2(gatk_filter2_final_vcf_file_no_proximate_snp, out_path, analysis, reference, logger, Config)
+    # keep_logging('The final Consensus Fasta file: {}'.format(gatk_vcf2fasta_filter2_file), 'The final Consensus Fasta file: {}'.format(gatk_vcf2fasta_filter2_file), logger, 'debug')
+    # keep_logging('The final Consensus Fasta file with no proximate: {}'.format(gatk_vcf2fasta_filter2_file_no_proximate), 'The final Consensus Fasta file with no proximate: {}'.format(gatk_vcf2fasta_filter2_file_no_proximate), logger, 'debug')
+    # keep_logging('The final Consensus Fasta file from VCF-consensus: {}'.format(vcftools_vcf2fasta_filter2_file), 'The final Consensus Fasta file from VCF-consensus: {}'.format(vcftools_vcf2fasta_filter2_file), logger, 'debug')
+    # keep_logging('The final Consensus Fasta file from VCF-consensus with no proximate: {}'.format(vcftools_vcf2fasta_filter2_file_no_proximate), 'The final Consensus Fasta file from VCF-consensus with no proximate: {}'.format(vcftools_vcf2fasta_filter2_file_no_proximate), logger, 'debug')
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
 
 def filter_indels(final_raw_indel_vcf, out_path, analysis, ref_index, logger, Config, Avg_dp):
     reference = ConfigSectionMap(ref_index, Config)['ref_path'] + "/" + ConfigSectionMap(ref_index, Config)['ref_name']

diff --git a/modules/variant_diagnostics/core_pipeline.py b/modules/variant_diagnostics/core_pipeline.py
@@ -3386,7 +3386,11 @@ def someOtherFunc(data, key):
         files_for_tabix = glob.glob("%s/*.vcf" % args.filter2_only_snp_vcf_dir)
         tabix(files_for_tabix, "vcf", logger, Config)
 
+<<<<<<< HEAD
         # # Get the cluster option; create and run jobs based on given parameter. The jobs will parse all the intermediate vcf file to extract information such as if any unique variant position was unmapped in a sample, if it was filtered out dur to DP,MQ, FQ, proximity to indel, proximity to other SNPs and other variant filter parameters set in config file.
+=======
+        # Get the cluster option; create and run jobs based on given parameter. The jobs will parse all the intermediate vcf file to extract information such as if any unique variant position was unmapped in a sample, if it was filtered out dur to DP,MQ, FQ, proximity to indel, proximity to other SNPs and other variant filter parameters set in config file.
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
         tmp_dir = "/tmp/temp_%s/" % log_unique_time
         create_job(args.jobrun, vcf_filenames, unique_position_file, tmp_dir)
         create_indel_job(args.jobrun, vcf_filenames, unique_indel_position_file, tmp_dir)
@@ -3681,6 +3685,10 @@ def someOtherFunc(data, key):
         call("%s" % prepare_ref_var_consensus_input_cmd, logger)
         call("%s" % prepare_var_consensus_input_cmd, logger)
         call("%s" % prepare_allele_var_consensus_input_cmd, logger)
+<<<<<<< HEAD
+=======
+        #call("%s" % prepare_ref_allele_var_consensus_input_cmd, logger)
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
         call("%s" % prepare_ref_allele_unmapped_consensus_input_cmd, logger)
 
 
@@ -3713,9 +3721,15 @@ def someOtherFunc(data, key):
         # # Read new allele matrix and generate fasta; generate a seperate function
         keep_logging('Generating Fasta from Variant Alleles...\n', 'Generating Fasta from Variant Alleles...\n', logger, 'info')
 
+<<<<<<< HEAD
         # create_job_allele_variant_fasta(args.jobrun, vcf_filenames, args.filter2_only_snp_vcf_dir, config_file)
         #
         # extract_only_ref_variant_fasta_from_reference_allele_variant()
+=======
+        #create_job_allele_variant_fasta(args.jobrun, vcf_filenames, args.filter2_only_snp_vcf_dir, config_file)
+
+        #extract_only_ref_variant_fasta_from_reference_allele_variant()
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
 
         call("cp %s %s/Logs/core/" % (
             log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger)

diff --git a/pipeline.py b/pipeline.py
@@ -481,7 +481,11 @@ def cleanup(args, logger):
     files_to_delete = []
     Config = ConfigParser.ConfigParser()
     Config.read(config_file)
+<<<<<<< HEAD
     #pipeline(args, logger)
+=======
+    pipeline(args, logger)
+>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
     cleanup(args, logger)
     keep_logging('End: Pipeline', 'End: Pipeline', logger, 'info')
     time_taken = datetime.now() - start_time_2