From dca60137acce4234316f668de53bd39a710c9aae Mon Sep 17 00:00:00 2001 From: luis Date: Mon, 29 Oct 2018 16:20:01 +0100 Subject: [PATCH 1/5] fixing bug: PLOT not detected when the alignment sequence is at the start or at the end of the contig --- taranis.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/taranis.py b/taranis.py index 3d0f7fa..b30177b 100755 --- a/taranis.py +++ b/taranis.py @@ -522,7 +522,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, samples_inferred = [] #allele_list_per_sample = [] for sample_file in sample_dict_files: - #print('sample file is: ', sample_file) + print('sample file is: ', sample_file) #with open (sample_file,'rb') as sample_f : # sample_dict = pickle.load(sample_f) #logger.debug('loaded in memory the sample file %s' , sample_file) @@ -726,22 +726,33 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, break allele_sequence = allele_item.seq - + # Retrieve the contig file for getting the contig sequence for the id found in Blast + contig_file = os.path.join(inputdir,str(sample_value + '.fasta')) + records = list (SeqIO.parse(contig_file, "fasta")) + #accession_sequence = records[accession] + for record in records: + if record.id == sseqid : + break + accession_sequence = record.seq #if int(s_length) < min(schema_variability[core_name]) : if int(s_length) < int(query_length) : - ## check if the blast alignment could be clasified as PLOT - seq_id_split = sseqid.split('_') + ## check if the blast alignment could be classified as PLOT + #seq_id_split = sseqid.split('_') #length_sseqid = seq_id_split[3] - length_sseqid = len(sseq) - if sstart == length_sseqid or send == length_sseqid: - samples_matrix_dict[sample_value].append('PLOT') + length_sseqid = len(accession_sequence) + if int(sstart) == length_sseqid or int(send) == length_sseqid or int(sstart) == 1 or int(send) == 1: + samples_matrix_dict[sample_value].append('PLOT') logger.info('PLOT found at sample %s, for gene %s', sample_value, core_name) if sample_value not in plot_dict : plot_dict[sample_value] = {} if not core_name in plot_dict[sample_value] : plot_dict[sample_value][core_name] = [] plot_dict[sample_value][core_name].append([qseqid,sseqid,bitscore,sstart, send, sseq]) + if int(sstart) == length_sseqid or int(send) == length_sseqid: + print( 'PLOT Final') + else: + print ('PLOT inicio') continue else: # print ('There is a deletion of ', gapopen,'gaps', 'or shorter mapping') @@ -755,11 +766,12 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, contig_file = os.path.join(inputdir,str(sample_value + '.fasta')) records = list (SeqIO.parse(contig_file, "fasta")) #accession_sequence = records[accession] + ''' for record in records: if record.id == sseqid : break accession_sequence = record.seq - + ''' if allele_sequence.endswith ('TGA') or allele_sequence.startswith ('TCA') : tga_stop_codon = True else: From 491a4f7f1d99bc4da8978aa1163a04eebaea2b2c Mon Sep 17 00:00:00 2001 From: luis Date: Mon, 29 Oct 2018 16:46:48 +0100 Subject: [PATCH 2/5] removing debugging print --- taranis.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/taranis.py b/taranis.py index b30177b..d0cb90a 100755 --- a/taranis.py +++ b/taranis.py @@ -522,7 +522,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, samples_inferred = [] #allele_list_per_sample = [] for sample_file in sample_dict_files: - print('sample file is: ', sample_file) + 
#print('sample file is: ', sample_file) #with open (sample_file,'rb') as sample_f : # sample_dict = pickle.load(sample_f) #logger.debug('loaded in memory the sample file %s' , sample_file) @@ -749,10 +749,6 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, if not core_name in plot_dict[sample_value] : plot_dict[sample_value][core_name] = [] plot_dict[sample_value][core_name].append([qseqid,sseqid,bitscore,sstart, send, sseq]) - if int(sstart) == length_sseqid or int(send) == length_sseqid: - print( 'PLOT Final') - else: - print ('PLOT inicio') continue else: # print ('There is a deletion of ', gapopen,'gaps', 'or shorter mapping') @@ -815,6 +811,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, index_delete = deletions_dict[core_name].index(new_sseq) if new_sequence_lenght < query_length : delete_allele = 'ASM_DELETE_' + core_name + '_' + str(index_delete) + print(delete_allele) elif new_sequence_lenght == query_length: delete_allele = 'AEM_DELETE_' + core_name + '_' + str(index_delete) else: From 863c6731756aaf4841038c4c2a553a3c62cda9a9 Mon Sep 17 00:00:00 2001 From: luis Date: Mon, 29 Oct 2018 16:51:25 +0100 Subject: [PATCH 3/5] removing one last debugging print --- taranis.py | 1 - 1 file changed, 1 deletion(-) diff --git a/taranis.py b/taranis.py index d0cb90a..aa1be2f 100755 --- a/taranis.py +++ b/taranis.py @@ -811,7 +811,6 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, index_delete = deletions_dict[core_name].index(new_sseq) if new_sequence_lenght < query_length : delete_allele = 'ASM_DELETE_' + core_name + '_' + str(index_delete) - print(delete_allele) elif new_sequence_lenght == query_length: delete_allele = 'AEM_DELETE_' + core_name + '_' + str(index_delete) else: From b850d56b44a5bcda638c3277de4be33f0e1e7f88 Mon Sep 17 00:00:00 2001 From: luis Date: Tue, 30 Oct 2018 14:39:18 +0100 Subject: [PATCH 4/5] fixing bug when the stop codon was not found --- taranis.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/taranis.py b/taranis.py index aa1be2f..8fe1291 100755 --- a/taranis.py +++ b/taranis.py @@ -425,10 +425,10 @@ def get_aligments_for_deletions (sample_seq, query_seq): def create_summary (samples_matrix_dict, logger) : summary_dict = {} summary_result_list = [] - summary_heading_list = ['Exact match', 'INF', 'ASM_INSERT', 'ASM_DELETE','ALM_INSERT' ,'ALM_DELETE', 'LNF','NIPH','NIPHEM','PLOT'] + summary_heading_list = ['Exact match', 'INF', 'ASM_INSERT', 'ASM_DELETE','ALM_INSERT' ,'ALM_DELETE', 'LNF','NIPH','NIPHEM','PLOT','ERROR'] summary_result_list.append('File\t' + '\t'.join(summary_heading_list)) for key in sorted (samples_matrix_dict) : - summary_dict[key] = {'Exact match':0, 'INF':0, 'ASM_INSERT':0, 'ASM_DELETE':0, 'ALM_INSERT':0, 'ALM_DELETE':0, 'LNF':0, 'NIPH':0, 'NIPHEM':0, 'PLOT':0} + summary_dict[key] = {'Exact match':0, 'INF':0, 'ASM_INSERT':0, 'ASM_DELETE':0, 'ALM_INSERT':0, 'ALM_DELETE':0, 'LNF':0, 'NIPH':0, 'NIPHEM':0, 'PLOT':0, 'ERROR':0} for values in samples_matrix_dict[key] : if 'INF_' in values : summary_dict[key]['INF'] += 1 @@ -448,6 +448,8 @@ def create_summary (samples_matrix_dict, logger) : summary_dict[key]['NIPHEM'] += 1 elif 'PLOT' in values : summary_dict[key]['PLOT'] += 1 + elif 'ERROR' in values : + summary_dict[key]['ERROR'] += 1 else: try: number =int(values) @@ -469,7 +471,9 @@ def create_summary (samples_matrix_dict, logger) : return summary_result_list - +def loadingBar(count,total,size): + percent = 
float(count)/float(total)*100 + sys.stdout.write("\r" + str(int(count)).rjust(3,'0')+"/"+str(int(total)).rjust(3,'0') + ' [' + '='*int(percent/10)*size + ' '*(10-int(percent/10))*size + ']') def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, sample_dict_files, blast_db_directory, inputdir, outputdir, cpus , percentlength, schema_variability, logger ): full_gene_list = [] @@ -497,9 +501,12 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, header_snp = ['Sample Name','Core Gene', 'Position','Sequence Sample/Schema','Protein in Sample/Schema', 'Annotation Sample / Schema'] header_protein = ['Sample Name','Core Gene', 'Protein in ' , 'Protein sequence'] header_match_alignment = ['Sample Name','Core Gene','Alignment', 'Sequence'] - + + number_of_genes = len(core_gene_dict_files) + print('Allele calling starts') for core_file in core_gene_dict_files: - print ( 'Analyzing core file : ', core_file) + #loadingBar(count,total,size) + #print ( 'Analyzing core file : ', core_file) full_gene_list.append(os.path.basename(core_file)) logger.info('Processing core gene file %s ', core_file) core_name = os.path.basename(core_file) @@ -522,7 +529,6 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, samples_inferred = [] #allele_list_per_sample = [] for sample_file in sample_dict_files: - #print('sample file is: ', sample_file) #with open (sample_file,'rb') as sample_f : # sample_dict = pickle.load(sample_f) #logger.debug('loaded in memory the sample file %s' , sample_file) @@ -780,7 +786,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, sample_gene_sequence = accession_sequence[int(sstart) - 51 : int(send) ] sample_gene_sequence = sample_gene_sequence.reverse_complement() else: - sample_gene_sequence = accession_sequence[int(send) -1 : int(sstart) + 51] + sample_gene_sequence = accession_sequence[int(send) -1 : int(sstart) + 51] else: if int(sstart) > int (send): sample_gene_sequence = accession_sequence[int(send) - 51 : int(sstart) ] @@ -862,6 +868,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, protein_dict[core_name][sample_value] = nucleotide_to_protein_aligment(new_sseq, qseq ) else: logger.error('ERROR : Stop codon was not found for the core %s and the sample %s', core_name, sample_value) + samples_matrix_dict[sample_value].append('ERROR not stop codon when deletion') #if int(s_length) > int(query_length) : elif int(s_length) > max(schema_variability[core_name]) : @@ -943,10 +950,10 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, else: - samples_matrix_dict[sample_value].append('ERROR ') + samples_matrix_dict[sample_value].append('ERROR not stop codon when insertion') print ('ERROR when looking the allele match for core gene ', core_name, 'at sample ', sample_value ) - + ''' logger.debug ('matching genes = %s', matching_genes_dict) logger.debug ('---------------------------------------------------') logger.debug ('sample matrix = %s', samples_matrix_dict) @@ -967,7 +974,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, logger.debug ('---------------------------------------------------') logger.debug ('list of proteins = %s' , protein_dict) logger.debug ('---------------------------------------------------') - + ''' From 644e0f7b8df22e9ed310d3bea9db39534bb98e67 Mon Sep 17 00:00:00 2001 From: luis Date: Wed, 31 Oct 2018 00:21:48 +0100 Subject: [PATCH 5/5] Adding progress bar. 
fixing bug when Insertion --- analyze_schema.py | 1 + taranis.py | 147 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 104 insertions(+), 44 deletions(-) diff --git a/analyze_schema.py b/analyze_schema.py index 8817336..47c05a4 100644 --- a/analyze_schema.py +++ b/analyze_schema.py @@ -32,6 +32,7 @@ def check_arg(args=None): evaluate_parser = subparser.add_parser('evaluate', help = 'Evaluate the schema ') evaluate_parser.add_argument('-input_dir', help = 'Directory where are the schema files.') + evaluate_parser.add_argument('-alt', required = False, help = 'Set to Yes if alternative start codon should be considered. Set to No to accept only ATG start codon', default = False) compare_parser = subparser.add_parser('compare', help = 'Compare 2 schema') compare_parser.add_argument('-scheme1', help = 'Directory where are the schema files for the schema 1') diff --git a/taranis.py b/taranis.py index 8fe1291..8e0ac27 100755 --- a/taranis.py +++ b/taranis.py @@ -29,16 +29,20 @@ import subprocess #from subprocess import check_output import shutil +from progressbar import ProgressBar def open_log(log_name): working_dir = os.getcwd() log_name=os.path.join(working_dir, log_name) #def create_log (): logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) + #logger.setLevel(logging.DEBUG) + logger.setLevel(logging.INFO) + #create the file handler handler = logging.handlers.RotatingFileHandler(log_name, maxBytes=200000, backupCount=5) - handler.setLevel(logging.DEBUG) + #handler.setLevel(logging.DEBUG) + handler.setLevel(logging.INFO) #create a Logging format formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') @@ -323,7 +327,7 @@ def check_sequence_order(allele_sequence, logger) : if allele_sequence[len(allele_sequence) -3: len(allele_sequence)] in start_codon_reverse : return 'reverse' return False - +''' def get_snp_2(sample, query) : snp_list = [] for i in range(len(sample)): @@ -333,8 +337,9 @@ def get_snp_2(sample, query) : except: snp_list.append([str(i+1), '-', sample[i]]) return snp_list - -def get_snp (sample, query) : +''' +''' +def get_snp_3 (sample, query) : prot_annotation = {'S': 'polar' ,'T': 'polar' ,'Y': 'polar' ,'Q': 'polar' ,'N': 'polar' ,'C': 'polar' ,'S': 'polar' , 'F': 'nonpolar' ,'L': 'nonpolar','I': 'nonpolar','M': 'nonpolar','P': 'nonpolar','V': 'nonpolar','A': 'nonpolar','W': 'nonpolar','G': 'nonpolar', 'D' : 'acidic', 'E' :'acidic', @@ -354,7 +359,7 @@ def get_snp (sample, query) : seq_sample = Seq.Seq(sample) seq_query = Seq.Seq(query) - + for index in range (0, length -2, 3) : codon_seq = seq_sample[index : index + 3] codon_que = seq_query[index : index + 3] @@ -368,12 +373,55 @@ def get_snp (sample, query) : else: prot_que = '-' snp_list.append([str(index+1),str(codon_seq) + '/'+ str(codon_que), prot_seq + '/' + prot_que, prot_annotation[prot_seq] + ' / ' + prot_annotation[prot_que]]) + + return snp_list +''' +def get_snp (sample, query) : + prot_annotation = {'S': 'polar' ,'T': 'polar' ,'Y': 'polar' ,'Q': 'polar' ,'N': 'polar' ,'C': 'polar' ,'S': 'polar' , + 'F': 'nonpolar' ,'L': 'nonpolar','I': 'nonpolar','M': 'nonpolar','P': 'nonpolar','V': 'nonpolar','A': 'nonpolar','W': 'nonpolar','G': 'nonpolar', + 'D' : 'acidic', 'E' :'acidic', + 'H': 'basic' , 'K': 'basic' , 'R' : 'basic', + '-': '-----', '*' : 'Stop codon'} - - - + snp_list = [] + length = max(len(sample), len(query)) + # normalize the lenght of the sample for the iteration + if len(sample) < length : + need_to_add = length - len(sample) + sample = 
sample + need_to_add * '-' + if len(query) < length : + need_to_add = length - len(query) + query = query + need_to_add * '-' + # convert to Seq class to translate to protein + seq_sample = Seq.Seq(sample) + seq_query = Seq.Seq(query) + + for index in range(length): + if seq_query[index] != seq_sample[index] : + triple_index = index - (index % 3) + codon_seq = seq_sample[triple_index : triple_index + 3] + codon_que = seq_query[triple_index : triple_index + 3] + if not '-' in str(codon_seq) : + prot_seq = str(codon_seq.translate()) + else: + prot_seq = '-' + prot_que = str(seq_query[triple_index: ].translate()) + if not '-' in str(codon_que) : + prot_que = str(codon_que.translate()) + else: + prot_que = '-' + prot_seq = str(seq_sample[triple_index: ].translate()) + snp_list.append([str(index+1),str(seq_sample[index]) + '/' + str(seq_query[index]), str(codon_seq) + '/'+ str(codon_que), + # when one of the sequence ends but not the other we will translate the remain sequence to proteins + # in that case we will only annotate the first protein. Using [0] as key of the dictionary annotation + prot_seq + '/' + prot_que, prot_annotation[prot_seq[0]] + ' / ' + prot_annotation[prot_que[0]]]) + if '-' in str(codon_seq) or '-' in str(codon_que) : + break + + return snp_list + def convert_to_protein (sequence) : seq = Seq.Seq(sequence) @@ -428,7 +476,7 @@ def create_summary (samples_matrix_dict, logger) : summary_heading_list = ['Exact match', 'INF', 'ASM_INSERT', 'ASM_DELETE','ALM_INSERT' ,'ALM_DELETE', 'LNF','NIPH','NIPHEM','PLOT','ERROR'] summary_result_list.append('File\t' + '\t'.join(summary_heading_list)) for key in sorted (samples_matrix_dict) : - summary_dict[key] = {'Exact match':0, 'INF':0, 'ASM_INSERT':0, 'ASM_DELETE':0, 'ALM_INSERT':0, 'ALM_DELETE':0, 'LNF':0, 'NIPH':0, 'NIPHEM':0, 'PLOT':0, 'ERROR':0} + summary_dict[key] = {'Exact match':0, 'INF':0, 'ASM_INSERT':0, 'ASM_DELETE':0, 'AEM_INSERT' :0, 'AEM_DELETE':0, 'ALM_INSERT':0, 'ALM_DELETE':0, 'LNF':0, 'NIPH':0, 'NIPHEM':0, 'PLOT':0, 'ERROR':0} for values in samples_matrix_dict[key] : if 'INF_' in values : summary_dict[key]['INF'] += 1 @@ -436,10 +484,14 @@ def create_summary (samples_matrix_dict, logger) : summary_dict[key]['ASM_INSERT'] += 1 elif 'ASM_DELETE' in values : summary_dict[key]['ASM_DELETE'] += 1 + elif 'AEM_DELETE' in values : + summary_dict[key]['AEM_DELETE'] += 1 elif 'ALM_INSERT' in values : summary_dict[key]['ALM_INSERT'] += 1 elif 'ALM_DELETE' in values : summary_dict[key]['ALM_DELETE'] += 1 + elif 'AEM_INSERT' in values : + summary_dict[key]['AEM_INSERT'] += 1 elif 'LNF' in values : summary_dict[key]['LNF'] += 1 elif 'NIPH' in values : @@ -471,10 +523,6 @@ def create_summary (samples_matrix_dict, logger) : return summary_result_list -def loadingBar(count,total,size): - percent = float(count)/float(total)*100 - sys.stdout.write("\r" + str(int(count)).rjust(3,'0')+"/"+str(int(total)).rjust(3,'0') + ' [' + '='*int(percent/10)*size + ' '*(10-int(percent/10))*size + ']') - def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, sample_dict_files, blast_db_directory, inputdir, outputdir, cpus , percentlength, schema_variability, logger ): full_gene_list = [] samples_matrix_dict = {} # to keep allele number @@ -494,19 +542,20 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, blast_parameters = '"6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq"' header_macthing_alleles_conting = ['Sample 
Name', 'Contig', 'Core Gene','start', 'stop', 'direction', 'codification'] header_paralogs = ['Sample Name','Core Gene', 'Allele','Contig','Bit Score', 'Start Seq', 'End Seq','Sequence'] - header_inferred = ['Sample Name','Core Gene', 'Allele','Contig','Bit Score', 'Start Seq', 'End Seq','Sequence'] + header_inferred = ['Sample Name','Core Gene', 'Inferred Allele list'] header_insertions = [ 'Core Gene', 'Sample Name' , 'Insertion item' ,'Allele', 'Contig', 'Bitscore', 'Query length' , 'Contig length', 'New sequence length' , 'Mismatch' , 'gaps', 'Contig start', 'Contig end', 'New sequence' ] header_deletions = [ 'Core Gene', 'Sample Name' , 'Deletion item' ,'Allele', 'Contig', 'Bitscore', 'Query length' , 'Contig length', 'New sequence length' , 'Mismatch' , 'gaps', 'Contig start', 'Contig end', 'New sequence' ] header_plot = ['Core Gene', 'Sample Name' , 'Allele','Contig','Bit Score', 'Start Seq', 'End Seq','Sequence'] - header_snp = ['Sample Name','Core Gene', 'Position','Sequence Sample/Schema','Protein in Sample/Schema', 'Annotation Sample / Schema'] + header_snp = ['Core Gene', 'Sample Name', 'Position', 'Mutation Sample/Schema', 'Codon Sample/Schema','Protein in Sample/Schema', 'Annotation Sample / Schema'] header_protein = ['Sample Name','Core Gene', 'Protein in ' , 'Protein sequence'] header_match_alignment = ['Sample Name','Core Gene','Alignment', 'Sequence'] number_of_genes = len(core_gene_dict_files) print('Allele calling starts') - for core_file in core_gene_dict_files: - #loadingBar(count,total,size) - #print ( 'Analyzing core file : ', core_file) + pbar = ProgressBar () + for core_file in pbar(core_gene_dict_files) : + #for core_file in core_gene_dict_files: + full_gene_list.append(os.path.basename(core_file)) logger.info('Processing core gene file %s ', core_file) core_name = os.path.basename(core_file) @@ -712,6 +761,14 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, samples_matrix_dict[sample_value].append(inferred_allele) inf_dict[sample_value][core_name].append([qseqid,sseqid,bitscore,sstart, send, sseq]) + # Get the SNP for the new allele inferred + if not core_name in snp_dict : + snp_dict[core_name] = {} + if not sample_value in snp_dict[core_name] : + snp_dict[core_name][sample_value] = [] + snp_dict[core_name][sample_value] = get_snp(sseq, qseq) + + if not sseqid in matching_genes_dict[sample_value] : matching_genes_dict[sample_value][sseqid] = [] if sstart > send : @@ -805,8 +862,8 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, # sample_gene_sequence = accession_sequence[int(send) -1: int(sstart) + bassed_added_end + 51] stop_index = get_stop_codon_index(sample_gene_sequence, tga_stop_codon, int(qlen)- int(qstart)) if stop_index != False: - new_sequence_lenght = stop_index +3 - new_sseq = str(sample_gene_sequence[0:new_sequence_lenght]) + new_sequence_length = stop_index +3 + new_sseq = str(sample_gene_sequence[0:new_sequence_length]) ### adding ASM allele to the asm_allele_matrix if it is not already include if not core_name in deletions_dict : @@ -815,9 +872,9 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, deletions_dict[core_name].append(new_sseq) ### find the index of ASM to include it in the sample matrix dict index_delete = deletions_dict[core_name].index(new_sseq) - if new_sequence_lenght < query_length : + if new_sequence_length < query_length : delete_allele = 'ASM_DELETE_' + core_name + '_' + str(index_delete) - elif new_sequence_lenght == 
query_length: + elif new_sequence_length == query_length: delete_allele = 'AEM_DELETE_' + core_name + '_' + str(index_delete) else: delete_allele = 'ALM_DELETE_' + core_name + '_' + str(index_delete) @@ -826,27 +883,27 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, if not sseqid in matching_genes_dict[sample_value] : matching_genes_dict[sample_value][sseqid] = [] if sstart > send : - matching_genes_dict[sample_value][sseqid].append([core_name, str(int(sstart)-new_sequence_lenght -1), sstart,'-', delete_allele]) + matching_genes_dict[sample_value][sseqid].append([core_name, str(int(sstart)-new_sequence_length -1), sstart,'-', delete_allele]) else: - matching_genes_dict[sample_value][sseqid].append([core_name, sstart,str(int(sstart)+ new_sequence_lenght),'+', delete_allele]) + matching_genes_dict[sample_value][sseqid].append([core_name, sstart,str(int(sstart)+ new_sequence_length),'+', delete_allele]) ### add the deletion into deletion list if not core_name in list_deletions : list_deletions [core_name] = {} if not sample_value in list_deletions[core_name] : list_deletions[core_name][sample_value] = {} - list_deletions[core_name][sample_value][delete_allele] = [qseqid, sseqid, bitscore, str(query_length) , s_length, str(new_sequence_lenght), mismatch , gapopen, sstart, send, new_sseq ] + list_deletions[core_name][sample_value][delete_allele] = [qseqid, sseqid, bitscore, str(query_length) , s_length, str(new_sequence_length), mismatch , gapopen, sstart, send, new_sseq ] if check_sequence_order(qseq, logger) == 'reverse' : qseq = str(allele_sequence.reverse_complement()) else: qseq = str(allele_sequence) # get the SNP for the delection - if not core_name in snp_dict : - snp_dict[core_name] = {} - if not sample_value in snp_dict[core_name] : - snp_dict[core_name][sample_value] = [] - snp_dict[core_name][sample_value] = get_snp(new_sseq, qseq) + #if not core_name in snp_dict : + # snp_dict[core_name] = {} + #if not sample_value in snp_dict[core_name] : + # snp_dict[core_name][sample_value] = [] + #snp_dict[core_name][sample_value] = get_snp(new_sseq, qseq) # execute again blast with the reference query the previous query found to get the aligment format to get the SNPs @@ -871,8 +928,8 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, samples_matrix_dict[sample_value].append('ERROR not stop codon when deletion') #if int(s_length) > int(query_length) : - elif int(s_length) > max(schema_variability[core_name]) : - #elif int(s_length) > int(query_length) : + #elif int(s_length) > max(schema_variability[core_name]) : + elif int(s_length) > int(query_length) : #print ('there is a insertion of ', gapopen ,' bases in the sequence') #print ('qlen is: ',qlen, ' seq_len is : ', length, 'query_reference_length is : ', query_length) #query_seq = Seq.Seq(qseq) @@ -881,9 +938,9 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, stop_index = get_stop_codon_index(sseq, tga_stop_codon, qseq.find('-')) if stop_index != False: - new_sequence_lenght = stop_index +3 + new_sequence_length = stop_index +3 ### adding ASM allele to the asm_allele_matrix if it is not already include - new_sseq = sseq[0:new_sequence_lenght] + new_sseq = sseq[0:new_sequence_length] if not core_name in insertions_dict : insertions_dict[core_name] = [] @@ -891,11 +948,13 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, insertions_dict[core_name].append(new_sseq) ### find the index of ASM to include it in the 
sample matrix dict index_insert = insertions_dict[core_name].index(new_sseq) - #if new_sequence_lenght < query_length : - if new_sequence_lenght < min(schema_variability[core_name]) : + #if new_sequence_length < query_length : + if new_sequence_length < min(schema_variability[core_name]) : insert_allele = 'ASM_INSERT_' + core_name + '_' + str(index_insert) - else: + elif new_sequence_length in schema_variability[core_name] : insert_allele = 'AEM_INSERT_' + core_name + '_' + str(index_insert) + else: + insert_allele = 'ALM_INSERT_' + core_name + '_' + str(index_insert) samples_matrix_dict[sample_value].append(insert_allele) else: samples_matrix_dict[sample_value].append('ALM_INSERT_') @@ -910,18 +969,18 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, list_insertions [core_name] = {} if not sample_value in list_insertions[core_name] : list_insertions[core_name][sample_value] = {} - list_insertions[core_name][sample_value][insert_allele] = [qseqid, sseqid, bitscore, str(query_length) , s_length, str(new_sequence_lenght), mismatch , gapopen, sstart, send, new_sseq ] + list_insertions[core_name][sample_value][insert_allele] = [qseqid, sseqid, bitscore, str(query_length) , s_length, str(new_sequence_length), mismatch , gapopen, sstart, send, new_sseq ] if check_sequence_order(qseq, logger) == 'reverse' : qseq = str(allele_sequence.reverse_complement()) else: qseq = str(allele_sequence) # get the SNP for the delection - if not core_name in snp_dict : - snp_dict[core_name] = {} - if not sample_value in snp_dict[core_name] : - snp_dict[core_name][sample_value] = [] - snp_dict[core_name][sample_value] = get_snp(new_sseq, qseq) + #if not core_name in snp_dict : + # snp_dict[core_name] = {} + #if not sample_value in snp_dict[core_name] : + # snp_dict[core_name][sample_value] = [] + #snp_dict[core_name][sample_value] = get_snp(new_sseq, qseq) ''' cline = NcbiblastnCommandline(db=blast_db_name, evalue=0.001, perc_identity = 80, outfmt= 5, max_target_seqs=10, max_hsps=10,num_threads=3)