Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pairwise #60

Merged
merged 6 commits into from
Feb 15, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions run_gubbins.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,9 +376,13 @@ def reconvert_fasta_file(input_filename, output_filename):
input_handle.close()
return

def pairwise_comparison(filename,base_filename,gubbins_exec,alignment_filename):
def pairwise_comparison(filename,base_filename,gubbins_exec,alignment_filename,fastml_exec):
sequence_names = get_sequence_names_from_alignment(filename)
create_pairwise_newick_tree(sequence_names, base_filename+".tre")

subprocess.check_call(generate_fastml_command(fastml_exec, base_filename+".gaps.snp_sites.aln", base_filename+".tre"), shell=True)
shutil.copyfile(base_filename+'.tre.output_tree',base_filename+".tre")
shutil.copyfile(base_filename+'.tre.seq.joint.txt', base_filename+".snp_sites.aln")
subprocess.check_call(gubbins_exec+" -r -v "+base_filename+".vcf -t "+base_filename+".tre -f "+ alignment_filename +" "+ base_filename+".snp_sites.aln", shell=True)

def create_pairwise_newick_tree(sequence_names, output_filename):
Expand Down Expand Up @@ -481,7 +485,7 @@ def is_exe(fpath):
# Perform pairwise comparison if there are only 2 sequences
number_of_sequences = number_of_sequences_in_alignment(args.alignment_filename)
if(number_of_sequences == 2):
pairwise_comparison(args.alignment_filename,starting_base_filename,GUBBINS_EXEC,args.alignment_filename)
pairwise_comparison(args.alignment_filename,starting_base_filename,GUBBINS_EXEC,args.alignment_filename,FASTML_EXEC)
sys.exit()

latest_file_name = "latest_tree."+base_filename_without_ext+"."+str(current_time)+".tre"
Expand Down
1 change: 1 addition & 0 deletions src/Newickform.c
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ newick_node* parseTree(char *str)
}

node->number_of_blocks = 0;
node->total_bases_removed_excluding_gaps = 0;
node->block_coordinates = (int **) malloc((3)*sizeof(int *));
node->block_coordinates[0] = (int*) malloc((3)*sizeof(int ));
node->block_coordinates[1] = (int*) malloc((3)*sizeof(int ));
Expand Down
1 change: 1 addition & 0 deletions src/Newickform.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ typedef struct newick_node
int number_of_snps;
int current_node_id;
int number_of_blocks;
int total_bases_removed_excluding_gaps;
int ** block_coordinates;

struct newick_child *child;
Expand Down
11 changes: 9 additions & 2 deletions src/branch_sequences.c
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ char *generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int *
branch_genome_size = calculate_size_of_genome_without_gaps(child_sequences[current_branch], 0,number_of_snps, length_of_original_genome);
number_of_branch_snps = calculate_number_of_snps_excluding_gaps(leaf_sequence, child_sequences[current_branch], number_of_snps, branches_snp_sites[current_branch], snp_locations,branch_snp_sequence,branch_snp_ancestor_sequence);

print_branch_snp_details(branch_snps_file_pointer, child_nodes[current_branch]->taxon,root->taxon, branches_snp_sites[current_branch], number_of_branch_snps, branch_snp_sequence, branch_snp_ancestor_sequence,root->taxon_names);
print_branch_snp_details(branch_snps_file_pointer, child_nodes[current_branch]->taxon,root->taxon, branches_snp_sites[current_branch], number_of_branch_snps, branch_snp_sequence, branch_snp_ancestor_sequence,child_nodes[current_branch]->taxon_names);

get_likelihood_for_windows(child_sequences[current_branch], number_of_snps, branches_snp_sites[current_branch], branch_genome_size, number_of_branch_snps,snp_locations, child_nodes[current_branch], block_file_pointer, root, branch_snp_sequence,gff_file_pointer,min_snps);
}
Expand Down Expand Up @@ -338,6 +338,7 @@ void get_likelihood_for_windows(char * child_sequence, int length_of_sequence, i
double branch_snp_density = 0.0;
double block_snp_density = 0.0;
int number_of_blocks = 0 ;
int original_branch_genome_size = branch_genome_size;

// place to store coordinates of recombinations snps
current_node->recombinations = (int *) malloc((length_of_sequence+1)*sizeof(int));
Expand Down Expand Up @@ -417,10 +418,11 @@ void get_likelihood_for_windows(char * child_sequence, int length_of_sequence, i

// block_coordinates will now contain merged blocks
number_of_blocks = merge_adjacent_blocks(block_coordinates, number_of_blocks,branch_snp_sequence,number_of_branch_snps,snp_site_coords);
int * candidate_blocks[3];
int * candidate_blocks[4];
candidate_blocks[0] = (int *) malloc((number_of_blocks+1)*sizeof(int));
candidate_blocks[1] = (int *) malloc((number_of_blocks+1)*sizeof(int));
candidate_blocks[2] = (int *) malloc((number_of_blocks+1)*sizeof(int));
candidate_blocks[3] = (int *) malloc((number_of_blocks+1)*sizeof(int));

int number_of_candidate_blocks = 0;

Expand Down Expand Up @@ -458,6 +460,7 @@ void get_likelihood_for_windows(char * child_sequence, int length_of_sequence, i
candidate_blocks[1][number_of_candidate_blocks] = current_end;
// TODO use a float in a struct here, should be okay for the moment but assumes that there will be a clear integer difference between best and second best
candidate_blocks[2][number_of_candidate_blocks] = (int) get_block_likelihood(branch_genome_size, number_of_branch_snps, block_genome_size_without_gaps, block_snp_count);
candidate_blocks[3][number_of_candidate_blocks] = block_genome_size_without_gaps;
number_of_candidate_blocks++;
break;
}
Expand All @@ -479,10 +482,12 @@ void get_likelihood_for_windows(char * child_sequence, int length_of_sequence, i
return;
}
number_of_branch_snps = flag_smallest_log_likelihood_recombinations(candidate_blocks, number_of_candidate_blocks, number_of_branch_snps, snp_site_coords, current_node->recombinations, current_node->num_recombinations,current_node, block_file_pointer, root, snp_locations, length_of_sequence,gff_file_pointer );
branch_genome_size = original_branch_genome_size - current_node->total_bases_removed_excluding_gaps;

candidate_blocks[0] = NULL;
candidate_blocks[1] = NULL;
candidate_blocks[2] = NULL;
candidate_blocks[3] = NULL;

}
}
Expand Down Expand Up @@ -533,6 +538,8 @@ int flag_smallest_log_likelihood_recombinations(int ** candidate_blocks, int num
print_block_details(block_file_pointer, candidate_blocks[0][smallest_index], candidate_blocks[1][smallest_index], number_of_recombinations_in_window, current_node->taxon, root->taxon, current_node->taxon_names);
print_gff_line(gff_file_pointer, candidate_blocks[0][smallest_index], candidate_blocks[1][smallest_index], number_of_recombinations_in_window, current_node->taxon, root->taxon, current_node->taxon_names);
current_node->number_of_blocks = current_node->number_of_blocks + 1;

current_node->total_bases_removed_excluding_gaps = current_node->total_bases_removed_excluding_gaps + candidate_blocks[3][smallest_index];

current_node->block_coordinates[0] = realloc((int *)current_node->block_coordinates[0], ((int)current_node->number_of_blocks +1)*sizeof(int));
current_node->block_coordinates[1] = realloc((int *)current_node->block_coordinates[1], ((int)current_node->number_of_blocks +1)*sizeof(int));
Expand Down