
Commit

Merge pull request #305 from sanger-pathogens/masking_aln
Version 3.1
nickjcroucher authored Oct 6, 2021
2 parents 25184cf + b85e832 commit 3db99f5
Showing 10 changed files with 545 additions and 78 deletions.
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
3.0.0
3.1.0
38 changes: 22 additions & 16 deletions python/gubbins/ValidateFastaAlignment.py
@@ -6,6 +6,7 @@
from Bio.Align import MultipleSeqAlignment

class ValidateFastaAlignment(object):

def __init__(self, input_filename):
self.input_filename = input_filename

@@ -22,9 +23,8 @@ def is_input_fasta_file_valid(self):
return False
except:
return False

return True

def does_each_sequence_have_a_name_and_genomic_data(self):
with open(self.input_filename, "r") as input_handle:
alignments = AlignIO.parse(input_handle, "fasta")
@@ -33,21 +33,19 @@ def does_each_sequence_have_a_name_and_genomic_data(self):
for record in alignment:
number_of_sequences +=1
if record.name is None or record.name == "":
print("Error with the input FASTA file: One of the sequence names is blank")
sys.stderr.write("Error with the input FASTA file: " + record.name + " is blank\n")
return False
if record.seq is None or record.seq == "":
print("Error with the input FASTA file: One of the sequences is empty")
sys.stderr.write("Error with the input FASTA file: " + record.name + " is empty\n")
return False
if re.search('[^ACGTNacgtn-]', str(record.seq)) != None:
print("Error with the input FASTA file: One of the sequences contains odd characters, only ACGTNacgtn- are permitted")
sys.stderr.write("Error with the input FASTA file: " + record.name + " contains disallowed characters, only ACGTNacgtn- are permitted\n")
return False
input_handle.close()
return True

def does_each_sequence_have_the_same_length(self):
try:
with open(self.input_filename) as input_handle:

alignments = AlignIO.parse(input_handle, "fasta")
sequence_length = -1
for alignment in alignments:
@@ -63,17 +61,25 @@ def does_each_sequence_have_the_same_length(self):
print("Error with the input FASTA file: It is in the wrong format so check its an alignment")
return False
return True

def are_sequence_names_unique(self):
with open(self.input_filename) as input_handle:
alignments = AlignIO.parse(input_handle, "fasta")
sequence_names = []
for alignment in alignments:
any_modified_names = False
with open(self.input_filename) as input_handle:
alignment = AlignIO.read(input_handle, "fasta")
sequence_names = []
for record in alignment:
# Remove disallowed characters
if '#' in record.name or ':' in record.name:
record.name = record.name.replace("#","_").replace(":","_")
record.id = record.id.replace("#", "_").replace(":", "_")
record.description = record.description.replace("#", "_").replace(":", "_")
any_modified_names = True
# Store modified names
sequence_names.append(record.name)

if [k for k,v in list(Counter(sequence_names).items()) if v>1] != []:
return False
input_handle.close()
return True
# Update alignment if names changed
with open(self.input_filename, "w") as output_handle:
AlignIO.write(alignment,output_handle, "fasta")
return True

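For orientation, a minimal sketch (not part of this commit) of how the validator above might be driven before a run; the wrapper name is hypothetical, the import path is assumed from the file location, and is_input_fasta_file_valid() is assumed to wrap the individual checks, as the hunk header above suggests:

import sys
from gubbins.ValidateFastaAlignment import ValidateFastaAlignment

def check_input_alignment(fasta_path):
    # Hypothetical helper: stop early if the FASTA alignment fails validation
    if not ValidateFastaAlignment(fasta_path).is_input_fasta_file_valid():
        sys.exit("Invalid input alignment: " + fasta_path)
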
142 changes: 115 additions & 27 deletions python/gubbins/common.py
@@ -256,8 +256,9 @@ def parse_and_run(input_args, program_description=""):
# 3.5a. Joint ancestral reconstruction with new tree and info file in each iteration
info_filename = model_fitter.get_info_filename(temp_working_dir,current_basename)
recontree_filename = model_fitter.get_recontree_filename(temp_working_dir,current_basename)
# Arbitrary rooting of tree
reroot_tree_at_midpoint(recontree_filename)
# Set root of reconstruction tree to match that of the current tree
# Cannot just midpoint root both, because the branch lengths differ between them
harmonise_roots(recontree_filename, temp_rooted_tree)

printer.print(["\nRunning joint ancestral reconstruction with pyjar"])
jar(alignment = polymorphism_alignment, # complete polymorphism alignment
@@ -271,8 +272,12 @@ def parse_and_run(input_args, program_description=""):
verbose = input_args.verbose)
gaps_alignment_filename = temp_working_dir + "/" + ancestral_sequence_basename + ".joint.aln"
raw_internal_rooted_tree_filename = temp_working_dir + "/" + ancestral_sequence_basename + ".joint.tre"
transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, temp_rooted_tree,
current_tree_name_with_internal_nodes, "pyjar")
printer.print(["\nTransferring pyjar results onto original recombination-corrected tree"])
transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename,
temp_rooted_tree,
current_tree_name_with_internal_nodes,
"pyjar")
printer.print(["\nDone transfer"])

else:

@@ -309,10 +314,12 @@ def parse_and_run(input_args, program_description=""):
current_tree_name_with_internal_nodes, sequence_reconstructor)
elif input_args.seq_recon == "iqtree" or input_args.seq_recon == "raxmlng":
# IQtree returns an unrooted tree
temp_unrooted_tree = temp_working_dir + "/" + current_tree_name + ".unrooted"
unroot_tree(temp_rooted_tree, temp_unrooted_tree)
transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, temp_unrooted_tree,
current_tree_name_with_internal_nodes, sequence_reconstructor)
harmonise_roots(raw_internal_rooted_tree_filename, temp_rooted_tree)
transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename,
temp_rooted_tree,
current_tree_name_with_internal_nodes,
sequence_reconstructor,
use_root = False)
else:
sys.stderr.write("Unrecognised sequence reconstruction command: " + input_args.seq_recon + '\n')
sys.exit()
@@ -699,7 +706,6 @@ def reroot_tree_with_outgroup(tree_name, outgroups):
with open(tree_name, 'w+') as output_file:
output_file.write(output_tree_string.replace('\'', ''))


def reroot_tree_at_midpoint(tree_name):
tree = dendropy.Tree.get_from_path(tree_name, 'newick', preserve_underscores=True)
split_all_non_bi_nodes(tree.seed_node)
@@ -718,6 +724,50 @@ def unroot_tree(input_filename, output_filename):
with open(output_filename, 'w+') as output_file:
output_file.write(output_tree_string.replace('\'', ''))

def harmonise_roots(new_tree_fn, tree_for_root_fn):
# Read in tree and get nodes adjacent to root
taxa = dendropy.TaxonNamespace()
tree_for_root = dendropy.Tree.get_from_path(tree_for_root_fn,
'newick',
preserve_underscores=True,
taxon_namespace=taxa,
rooting="force-rooted")
tree_for_root.encode_bipartitions()
root = tree_for_root.seed_node
root_adjacent_bipartitions = []
for root_adjacent_node in root.child_nodes():
root_adjacent_bipartitions.append(root_adjacent_node.edge.bipartition)
# Now search the new tree for these nodes
new_tree = dendropy.Tree.get_from_path(new_tree_fn,
'newick',
preserve_underscores=True,
taxon_namespace=taxa,
rooting="force-rooted")
new_tree.encode_bipartitions()
for node in new_tree.preorder_node_iter():
if node.edge.bipartition in root_adjacent_bipartitions:
half_branch_length = node.edge_length/2
new_tree.reroot_at_edge(node.edge,
length1 = half_branch_length,
length2 = half_branch_length)
break
new_tree_string = tree_as_string(new_tree,
suppress_internal=False,
suppress_rooting=False)

# Check both trees are topologically identical
missing_bipartitions = dendropy.calculate.treecompare.find_missing_bipartitions(tree_for_root,
new_tree,
is_bipartitions_updated=False)

if len(missing_bipartitions) > 0:
sys.stderr.write('Bipartitions missing during node label transfer: ' + str([str(x) for x in missing_bipartitions]))
sys.exit(1)

# Write output
with open(new_tree_fn, 'w+') as output_file:
output_file.write(new_tree_string.replace('\'', ''))

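As a standalone illustration (not from the diff), the new helper could be exercised like this, assuming both Newick files contain the same taxa; the file names are hypothetical, and the first file is rewritten in place with the harmonised root:

# Reroot the reconstruction tree on the edge whose bipartition matches a
# root-adjacent edge of the already-rooted Gubbins tree, splitting that
# branch length equally either side of the new root
harmonise_roots("ancestral_reconstruction.tre", "gubbins.rooted.tre")
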
def filter_out_removed_taxa_from_tree(input_filename, output_filename, taxa_removed):
tree = dendropy.Tree.get_from_path(input_filename, 'newick', preserve_underscores=True)
tree.prune_taxa_with_labels(taxa_removed, update_bipartitions=True)
@@ -791,32 +841,70 @@ def get_monophyletic_outgroup(tree_name, outgroups):


def transfer_internal_node_labels_to_tree(source_tree_filename, destination_tree_filename, output_tree_filename,
sequence_reconstructor):

sequence_reconstructor, use_root = True):
# read source tree and extract node labels, to match with the ancestral sequence reconstruction
source_tree = dendropy.Tree.get_from_path(source_tree_filename, 'newick', preserve_underscores=True)
source_internal_node_labels = []
taxa = dendropy.TaxonNamespace()
source_tree = dendropy.Tree.get_from_path(source_tree_filename,
'newick',
preserve_underscores=True,
taxon_namespace=taxa,
rooting='force-rooted')
source_tree.encode_bipartitions()
source_internal_node_dict = {}
for source_internal_node in source_tree.internal_nodes():
node_bipartition = source_internal_node.edge.bipartition
if source_internal_node.label:
source_internal_node_labels.append(source_internal_node.label)
source_internal_node_dict[node_bipartition] = source_internal_node.label
else:
source_internal_node_labels.append('')

source_internal_node_dict[node_bipartition] = ''
# read original tree and add in the labels from the ancestral sequence reconstruction
destination_tree = dendropy.Tree.get_from_path(destination_tree_filename, 'newick', preserve_underscores=True)
for index, destination_internal_node in enumerate(destination_tree.internal_nodes()):
if sequence_reconstructor == 'pyjar':
new_label = str(source_internal_node_labels[index])
else:
new_label = sequence_reconstructor.replace_internal_node_label(str(source_internal_node_labels[index]))
destination_internal_node.label = None
destination_internal_node.taxon = dendropy.Taxon(new_label)
destination_tree = dendropy.Tree.get_from_path(destination_tree_filename,
'newick',
preserve_underscores=True,
taxon_namespace=taxa,
rooting='force-rooted')
destination_tree.encode_bipartitions()

# Check both trees are topologically identical
missing_bipartitions = dendropy.calculate.treecompare.find_missing_bipartitions(source_tree,
destination_tree,
is_bipartitions_updated=False)
if len(missing_bipartitions) > 0:
sys.stderr.write('Bipartitions missing during node label transfer: ' + str([str(x) for x in missing_bipartitions]))
sys.exit(1)

root_alternative = ''
for destination_internal_node in destination_tree.internal_nodes():
if destination_internal_node != destination_tree.seed_node or use_root:
node_bipartition = destination_internal_node.edge.bipartition
if sequence_reconstructor == 'pyjar':
try:
new_label = source_internal_node_dict[node_bipartition]
except:
sys.stderr.write('Unable to find bipartition ' + str(node_bipartition) + '\n')
sys.exit(1)
else:
new_label = sequence_reconstructor.replace_internal_node_label(str(source_internal_node_dict[node_bipartition]))
destination_internal_node.label = None
destination_internal_node.taxon = dendropy.Taxon(new_label)

# output final tree
output_tree_string = tree_as_string(destination_tree, suppress_internal=False, suppress_rooting=False)
with open(output_tree_filename, 'w+') as output_file:
output_file.write(output_tree_string.replace('\'', ''))
if not use_root:
alt_root_name = ''
for root_child_node in destination_tree.seed_node.child_nodes():
alt_root_name = root_child_node.taxon.label
break
destination_tree.seed_node.label = None
destination_tree.seed_node.taxon = dendropy.Taxon(alt_root_name)
destination_tree_string = destination_tree.as_string(schema="newick",
suppress_rooting=True,
unquoted_underscores=True,
suppress_internal_node_labels=True).replace("'","").replace('\'', '')

with open(output_tree_filename, 'w+') as output_file:
print(destination_tree_string,
file=output_file,
end='')

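The substantive change above is that internal-node labels are now matched by edge bipartition rather than by iteration order. A minimal sketch of that lookup pattern (file names hypothetical, not part of the commit); it relies on both trees being read into one shared TaxonNamespace before encode_bipartitions() is called:

import dendropy

taxa = dendropy.TaxonNamespace()  # shared namespace makes bipartitions comparable
source = dendropy.Tree.get_from_path("source.tre", "newick", preserve_underscores=True,
                                     taxon_namespace=taxa, rooting="force-rooted")
destination = dendropy.Tree.get_from_path("destination.tre", "newick", preserve_underscores=True,
                                          taxon_namespace=taxa, rooting="force-rooted")
source.encode_bipartitions()
destination.encode_bipartitions()
# Key each source label by the bipartition of the edge above the node ...
labels = {node.edge.bipartition: node.label for node in source.internal_nodes()}
# ... then recover it for the topologically matching node in the destination tree
for node in destination.internal_nodes():
    print(node.edge.bipartition, "->", labels.get(node.edge.bipartition))
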
def remove_internal_node_labels_from_tree(input_filename, output_filename):
tree = dendropy.Tree.get_from_path(input_filename, 'newick', preserve_underscores=True)
