diff --git a/VERSION b/VERSION
index 4a36342f..fd2a0186 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.0.0
+3.1.0
diff --git a/python/gubbins/ValidateFastaAlignment.py b/python/gubbins/ValidateFastaAlignment.py
index 8a5b2426..9d67e788 100644
--- a/python/gubbins/ValidateFastaAlignment.py
+++ b/python/gubbins/ValidateFastaAlignment.py
@@ -6,6 +6,7 @@ from Bio.Align import MultipleSeqAlignment
 
 class ValidateFastaAlignment(object):
+
     def __init__(self, input_filename):
         self.input_filename = input_filename
 
@@ -22,9 +23,8 @@ def is_input_fasta_file_valid(self):
                 return False
         except:
             return False
-
         return True
-
+
     def does_each_sequence_have_a_name_and_genomic_data(self):
         with open(self.input_filename, "r") as input_handle:
             alignments = AlignIO.parse(input_handle, "fasta")
@@ -33,21 +33,19 @@ def does_each_sequence_have_a_name_and_genomic_data(self):
             for record in alignment:
                 number_of_sequences +=1
                 if record.name is None or record.name == "":
-                    print("Error with the input FASTA file: One of the sequence names is blank")
+                    sys.stderr.write("Error with the input FASTA file: " + record.name + " is blank\n")
                     return False
                 if record.seq is None or record.seq == "":
-                    print("Error with the input FASTA file: One of the sequences is empty")
+                    sys.stderr.write("Error with the input FASTA file: " + record.name + " is empty\n")
                     return False
                 if re.search('[^ACGTNacgtn-]', str(record.seq)) != None:
-                    print("Error with the input FASTA file: One of the sequences contains odd characters, only ACGTNacgtn- are permitted")
+                    sys.stderr.write("Error with the input FASTA file: " + record.name + " contains disallowed characters, only ACGTNacgtn- are permitted\n")
                     return False
-            input_handle.close()
         return True
-
+
     def does_each_sequence_have_the_same_length(self):
         try:
             with open(self.input_filename) as input_handle:
-
                 alignments = AlignIO.parse(input_handle, "fasta")
                 sequence_length = -1
                 for alignment in alignments:
@@ -63,17 +61,26 @@ def does_each_sequence_have_the_same_length(self):
             print("Error with the input FASTA file: It is in the wrong format so check its an alignment")
             return False
         return True
-
+
     def are_sequence_names_unique(self):
-        with open(self.input_filename) as input_handle:
-            alignments = AlignIO.parse(input_handle, "fasta")
-            sequence_names = []
-            for alignment in alignments:
+        any_modified_names = False
+        with open(self.input_filename) as input_handle:
+            alignment = AlignIO.read(input_handle, "fasta")
+            sequence_names = []
             for record in alignment:
+                # Replace disallowed characters in names with underscores
+                if '#' in record.name or ':' in record.name:
+                    record.name = record.name.replace("#", "_").replace(":", "_")
+                    record.id = record.id.replace("#", "_").replace(":", "_")
+                    record.description = record.description.replace("#", "_").replace(":", "_")
+                    any_modified_names = True
+                # Store modified names
                 sequence_names.append(record.name)
-
         if [k for k,v in list(Counter(sequence_names).items()) if v>1] != []:
             return False
-        input_handle.close()
-        return True
+        # Rewrite the input alignment only if any names were changed
+        if any_modified_names:
+            with open(self.input_filename, "w") as output_handle:
+                AlignIO.write(alignment, output_handle, "fasta")
+        return True
diff --git a/python/gubbins/common.py b/python/gubbins/common.py
index 61a6d318..97ce1dba 100644
--- a/python/gubbins/common.py
+++ b/python/gubbins/common.py
@@ -256,8 +256,9 @@ def parse_and_run(input_args, program_description=""):
 
     # 3.5a. 
Joint ancestral reconstruction with new tree and info file in each iteration info_filename = model_fitter.get_info_filename(temp_working_dir,current_basename) recontree_filename = model_fitter.get_recontree_filename(temp_working_dir,current_basename) - # Arbitrary rooting of tree - reroot_tree_at_midpoint(recontree_filename) + # Set root of reconstruction tree to match that of the current tree + # Cannot just midpoint root both, because the branch lengths differ between them + harmonise_roots(recontree_filename, temp_rooted_tree) printer.print(["\nRunning joint ancestral reconstruction with pyjar"]) jar(alignment = polymorphism_alignment, # complete polymorphism alignment @@ -271,8 +272,12 @@ def parse_and_run(input_args, program_description=""): verbose = input_args.verbose) gaps_alignment_filename = temp_working_dir + "/" + ancestral_sequence_basename + ".joint.aln" raw_internal_rooted_tree_filename = temp_working_dir + "/" + ancestral_sequence_basename + ".joint.tre" - transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, temp_rooted_tree, - current_tree_name_with_internal_nodes, "pyjar") + printer.print(["\nTransferring pyjar results onto original recombination-corrected tree"]) + transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, + temp_rooted_tree, + current_tree_name_with_internal_nodes, + "pyjar") + printer.print(["\nDone transfer"]) else: @@ -309,10 +314,12 @@ def parse_and_run(input_args, program_description=""): current_tree_name_with_internal_nodes, sequence_reconstructor) elif input_args.seq_recon == "iqtree" or input_args.seq_recon == "raxmlng": # IQtree returns an unrooted tree - temp_unrooted_tree = temp_working_dir + "/" + current_tree_name + ".unrooted" - unroot_tree(temp_rooted_tree, temp_unrooted_tree) - transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, temp_unrooted_tree, - current_tree_name_with_internal_nodes, sequence_reconstructor) + harmonise_roots(raw_internal_rooted_tree_filename, temp_rooted_tree) + transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, + temp_rooted_tree, + current_tree_name_with_internal_nodes, + sequence_reconstructor, + use_root = False) else: sys.stderr.write("Unrecognised sequence reconstruction command: " + input_args.seq_recon + '\n') sys.exit() @@ -699,7 +706,6 @@ def reroot_tree_with_outgroup(tree_name, outgroups): with open(tree_name, 'w+') as output_file: output_file.write(output_tree_string.replace('\'', '')) - def reroot_tree_at_midpoint(tree_name): tree = dendropy.Tree.get_from_path(tree_name, 'newick', preserve_underscores=True) split_all_non_bi_nodes(tree.seed_node) @@ -718,6 +724,50 @@ def unroot_tree(input_filename, output_filename): with open(output_filename, 'w+') as output_file: output_file.write(output_tree_string.replace('\'', '')) +def harmonise_roots(new_tree_fn, tree_for_root_fn): + # Read in tree and get nodes adjacent to root + taxa = dendropy.TaxonNamespace() + tree_for_root = dendropy.Tree.get_from_path(tree_for_root_fn, + 'newick', + preserve_underscores=True, + taxon_namespace=taxa, + rooting="force-rooted") + tree_for_root.encode_bipartitions() + root = tree_for_root.seed_node + root_adjacent_bipartitions = [] + for root_adjacent_node in root.child_nodes(): + root_adjacent_bipartitions.append(root_adjacent_node.edge.bipartition) + # Now search the new tree for these nodes + new_tree = dendropy.Tree.get_from_path(new_tree_fn, + 'newick', + preserve_underscores=True, + taxon_namespace=taxa, + rooting="force-rooted") + 
new_tree.encode_bipartitions()
+    for node in new_tree.preorder_node_iter():
+        if node.edge.bipartition in root_adjacent_bipartitions:
+            half_branch_length = node.edge_length/2
+            new_tree.reroot_at_edge(node.edge,
+                                    length1 = half_branch_length,
+                                    length2 = half_branch_length)
+            break
+    new_tree_string = tree_as_string(new_tree,
+                                     suppress_internal=False,
+                                     suppress_rooting=False)
+
+    # Check both trees are topologically identical
+    missing_bipartitions = dendropy.calculate.treecompare.find_missing_bipartitions(tree_for_root,
+                                                                                    new_tree,
+                                                                                    is_bipartitions_updated=False)
+
+    if len(missing_bipartitions) > 0:
+        sys.stderr.write('Bipartitions missing during node label transfer: ' + str([str(x) for x in missing_bipartitions]) + '\n')
+        sys.exit(1)
+
+    # Write output
+    with open(new_tree_fn, 'w+') as output_file:
+        output_file.write(new_tree_string.replace('\'', ''))
+
 def filter_out_removed_taxa_from_tree(input_filename, output_filename, taxa_removed):
     tree = dendropy.Tree.get_from_path(input_filename, 'newick', preserve_underscores=True)
     tree.prune_taxa_with_labels(taxa_removed, update_bipartitions=True)
@@ -791,32 +841,69 @@ def get_monophyletic_outgroup(tree_name, outgroups):
 
 def transfer_internal_node_labels_to_tree(source_tree_filename, destination_tree_filename, output_tree_filename,
-                                          sequence_reconstructor):
-
+                                          sequence_reconstructor, use_root = True):
     # read source tree and extract node labels, to match with the ancestral sequence reconstruction
-    source_tree = dendropy.Tree.get_from_path(source_tree_filename, 'newick', preserve_underscores=True)
-    source_internal_node_labels = []
+    taxa = dendropy.TaxonNamespace()
+    source_tree = dendropy.Tree.get_from_path(source_tree_filename,
+                                              'newick',
+                                              preserve_underscores=True,
+                                              taxon_namespace=taxa,
+                                              rooting='force-rooted')
+    source_tree.encode_bipartitions()
+    source_internal_node_dict = {}
     for source_internal_node in source_tree.internal_nodes():
+        node_bipartition = source_internal_node.edge.bipartition
         if source_internal_node.label:
-            source_internal_node_labels.append(source_internal_node.label)
+            source_internal_node_dict[node_bipartition] = source_internal_node.label
         else:
-            source_internal_node_labels.append('')
-
+            source_internal_node_dict[node_bipartition] = ''
     # read original tree and add in the labels from the ancestral sequence reconstruction
-    destination_tree = dendropy.Tree.get_from_path(destination_tree_filename, 'newick', preserve_underscores=True)
-    for index, destination_internal_node in enumerate(destination_tree.internal_nodes()):
-        if sequence_reconstructor == 'pyjar':
-            new_label = str(source_internal_node_labels[index])
-        else:
-            new_label = sequence_reconstructor.replace_internal_node_label(str(source_internal_node_labels[index]))
-        destination_internal_node.label = None
-        destination_internal_node.taxon = dendropy.Taxon(new_label)
+    destination_tree = dendropy.Tree.get_from_path(destination_tree_filename,
+                                                   'newick',
+                                                   preserve_underscores=True,
+                                                   taxon_namespace=taxa,
+                                                   rooting='force-rooted')
+    destination_tree.encode_bipartitions()
+
+    # Check both trees are topologically identical
+    missing_bipartitions = dendropy.calculate.treecompare.find_missing_bipartitions(source_tree,
+                                                                                    destination_tree,
+                                                                                    is_bipartitions_updated=False)
+    if len(missing_bipartitions) > 0:
+        sys.stderr.write('Bipartitions missing during node label transfer: ' + str([str(x) for x in missing_bipartitions]) + '\n')
+        sys.exit(1)
+
+    for destination_internal_node in destination_tree.internal_nodes():
+        if destination_internal_node != destination_tree.seed_node or use_root:
+            node_bipartition = destination_internal_node.edge.bipartition
+            if sequence_reconstructor == 'pyjar':
+                try:
+                    new_label = source_internal_node_dict[node_bipartition]
+                except KeyError:
+                    sys.stderr.write('Unable to find bipartition ' + str(node_bipartition) + '\n')
+                    sys.exit(1)
+            else:
+                new_label = sequence_reconstructor.replace_internal_node_label(str(source_internal_node_dict[node_bipartition]))
+            destination_internal_node.label = None
+            destination_internal_node.taxon = dendropy.Taxon(new_label)
 
     # output final tree
-    output_tree_string = tree_as_string(destination_tree, suppress_internal=False, suppress_rooting=False)
-    with open(output_tree_filename, 'w+') as output_file:
-        output_file.write(output_tree_string.replace('\'', ''))
+    if not use_root:
+        alt_root_name = ''
+        for root_child_node in destination_tree.seed_node.child_nodes():
+            alt_root_name = root_child_node.taxon.label
+            break
+        destination_tree.seed_node.label = None
+        destination_tree.seed_node.taxon = dendropy.Taxon(alt_root_name)
+    destination_tree_string = destination_tree.as_string(schema="newick",
+                                                         suppress_rooting=True,
+                                                         unquoted_underscores=True,
+                                                         suppress_internal_node_labels=True).replace("'", "")
+    with open(output_tree_filename, 'w+') as output_file:
+        print(destination_tree_string,
+              file=output_file,
+              end='')
 
 def remove_internal_node_labels_from_tree(input_filename, output_filename):
     tree = dendropy.Tree.get_from_path(input_filename, 'newick', preserve_underscores=True)
diff --git a/python/gubbins/pyjar.py b/python/gubbins/pyjar.py
index daac2137..a8128b34 100755
--- a/python/gubbins/pyjar.py
+++ b/python/gubbins/pyjar.py
@@ -20,7 +20,7 @@
     NumpyShared = collections.namedtuple('NumpyShared', ('name', 'shape', 'dtype'))
 except ImportError as e:
     sys.stderr.write("This version of Gubbins requires the multiprocessing library and python v3.8 or higher for memory management\n")
-    sys.exit(0)
+    sys.exit(201)
 
 from gubbins.utils import generate_shared_mem_array
 
@@ -31,31 +31,38 @@ def read_alignment(filename, file_type, verbose=False):
 
     if not os.path.isfile(filename):
         print("Error: alignment file " + filename + " does not exist")
-        sys.exit()
+        sys.exit(202)
     if verbose:
         print("Trying to open file " + filename + " as " + file_type)
     try:
-        alignmentObject = AlignIO.read(open(filename), file_type)
+        with open(filename,'r') as aln_in:
+            alignmentObject = AlignIO.read(aln_in, file_type)
         if verbose:
             print("Alignment read successfully")
     except:
         print("Cannot open alignment file " + filename + " as " + file_type)
-        sys.exit()
+        sys.exit(203)
     return alignmentObject
 
 #Calculate Pij from Q matrix and branch length
 def calculate_pij(branch_length,rate_matrix):
     if branch_length==0:
-        return numpy.array([[1, 0, 0, 0,], [0, 1, 0, 0,], [0, 0, 1, 0,], [0, 0, 0, 1,]])
+        return numpy.array([[0.0, float("-inf"), float("-inf"), float("-inf"),],
+                            [float("-inf"), 0.0, float("-inf"), float("-inf"),],
+                            [float("-inf"), float("-inf"), 0.0, float("-inf"),],
+                            [float("-inf"), float("-inf"), float("-inf"), 0.0,]])
     else:
-        return numpy.log(linalg.expm(numpy.multiply(branch_length,rate_matrix)))
+        return numpy.log(linalg.expm(numpy.multiply(branch_length,rate_matrix))) # log-transformed probabilities
 
 #Read the tree file and root
 def read_tree(treefile):
     if not os.path.isfile(treefile):
         print("Error: tree file does not exist")
-        sys.exit()
-    t=dendropy.Tree.get(path=treefile, schema="newick", preserve_underscores=True, rooting="force-rooted")
+        sys.exit(204)
+    
t=dendropy.Tree.get(path=treefile, + schema="newick", + preserve_underscores=True, + rooting="force-rooted") return t # Read the RAxML info file to get rates and frequencies @@ -63,11 +70,11 @@ def read_info(infofile, type = 'raxml'): if not os.path.isfile(infofile): print("Error: model information file " + infofile + " does not exist") - sys.exit() + sys.exit(205) if type not in ['raxml', 'raxmlng', 'iqtree','fasttree']: sys.stderr.write('Only able to parse GTR-type models from raxml, iqtree or fasttree') - sys.exit() + sys.exit(206) r=[-1.0] * 6 # initialise rates f=[-1.0] * 4 # initialise frequencies @@ -146,7 +153,7 @@ def read_info(infofile, type = 'raxml'): # Check frequencies and rates have been extracted correctly if -1.0 in f or -1.0 in r: sys.stderr.write('Problem with extracting model parameters - frequencies are ' + str(f) + ' and rates are ' + str(r)) - sys.exit() + sys.exit(207) return f, r @@ -239,7 +246,7 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence # Iterate over columns for column,base_pattern_columns_padded in zip(columns,column_positions): - + ### TIMING if verbose: calc_time_start = time.process_time() @@ -285,6 +292,7 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence #1b. Set for each amino acid i: Ly(i) = Pij(ty), where ty is the branch length between y and its father. node.L={"A": pij[base_matrix["A"]][base_matrix[base[taxon]]], "C": pij[base_matrix["C"]][base_matrix[base[taxon]]], "G": pij[base_matrix["G"]][base_matrix[base[taxon]]], "T": pij[base_matrix["T"]][base_matrix[base[taxon]]]} + else: node.C={"A": "A", "C": "C", "G": "G", "T": "T"} @@ -292,7 +300,7 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence except KeyError: print("Cannot find", taxon, "in base") - sys.exit() + sys.exit(208) else: node.L={} @@ -327,7 +335,6 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence c+=child.L[end] for start in columnbases: j=log(base_frequencies[base_matrix[end]])+c - if j>node.L[start]: node.L[start]=j node.C[start]=end @@ -339,7 +346,7 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence max_root_base_likelihood=node.L[root_base] max_root_base=node.C[root_base] node.r=max_root_base - + #Traverse the tree from the root in the direction of the OTUs, assigning to each node its most likely ancestral character as follows: for node in tree.preorder_node_iter(): @@ -354,8 +361,6 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence rootlens=[] for child in tree.seed_node.child_node_iter(): rootlens.append([child.edge_length,child,child.r]) - rootlens.sort(key = lambda x: x[0]) - tree.seed_node.r=rootlens[-1][1].r ### TIMING if verbose: @@ -480,7 +485,7 @@ def jar(alignment = None, base_patterns = None, base_pattern_positions = None, t node.taxon=tree.taxon_namespace.get_taxon(nodename) if nodename in alignment_sequence_names: print(nodename, "already in alignment. 
Quitting") - sys.exit() + sys.exit(209) ancestral_node_names.append(nodename) # index for reconstruction if node.parent_node != None: node.pij=calculate_pij(node.edge_length, rm) @@ -553,10 +558,17 @@ def jar(alignment = None, base_patterns = None, base_pattern_positions = None, t continue # Print tree + from gubbins.common import tree_as_string + if verbose: print("Printing tree with internal nodes labelled: ", output_prefix+".joint.tre") with open(output_prefix+".joint.tre", "w") as tree_output: - print(tree.as_string(schema="newick", suppress_rooting=True, unquoted_underscores=True, suppress_internal_node_labels=True).replace("'",""), file=tree_output) + + recon_tree = tree_as_string(tree, + suppress_rooting=True, + suppress_internal=False) + print(recon_tree.replace('\'', ''), + file = tree_output) if verbose: print("Done") diff --git a/python/gubbins/run_gubbins.py b/python/gubbins/run_gubbins.py index 5cf6d854..15f21c44 100755 --- a/python/gubbins/run_gubbins.py +++ b/python/gubbins/run_gubbins.py @@ -19,6 +19,7 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # +import sys import argparse from gubbins.__init__ import version import gubbins.common diff --git a/python/gubbins/treebuilders.py b/python/gubbins/treebuilders.py index fea26455..e6511f8a 100644 --- a/python/gubbins/treebuilders.py +++ b/python/gubbins/treebuilders.py @@ -629,8 +629,7 @@ def __init__(self, threads: 1, model: str, bootstrap = 0, internal_node_prefix = self.citation = "https://doi.org/10.1093/bioinformatics/btz305" # Set parallelisation - if self.threads > 1: - command.extend(["--threads", str(self.threads)]) + command.extend(["--threads", str(self.threads)]) # Add model command.extend(["-model"]) @@ -706,6 +705,7 @@ def select_executable_based_on_threads(self): def convert_raw_ancestral_states_to_fasta(self, input_filename, output_filename): """Converts the file containing ancestral sequences into FASTA format""" + # Use the IUPAC codes from here: https://www.bioinformatics.org/sms/iupac.html with open(input_filename, 'r') as infile: with open(output_filename, 'w+') as outfile: for sequence_line in infile: @@ -714,7 +714,13 @@ def convert_raw_ancestral_states_to_fasta(self, input_filename, output_filename) 'R': 'N', 'Y': 'N', 'S': 'N', - 'W': 'N' + 'W': 'N', + 'K': 'N', + 'M': 'N', + 'B': 'N', + 'D': 'N', + 'H': 'N', + 'V': 'N' } ) ) diff --git a/python/gubbins/utils.py b/python/gubbins/utils.py index 1f0bbab5..50c69843 100644 --- a/python/gubbins/utils.py +++ b/python/gubbins/utils.py @@ -108,18 +108,18 @@ def choose_executable_based_on_processor(list_of_executables: list): stdout=subprocess.PIPE, shell=True).communicate()[0].decode("utf-8") flags = output.lower().split() - if cpu_info: - # Iterate through list to match with CPU features - for executable in list_of_executables: - if 'AVX2' in executable and 'avx2' in flags and which(executable): - break - elif 'AVX' in executable and 'avx' in flags and which(executable): - break - elif 'SSE3' in executable and 'sse3' in flags and which(executable): - break - else: - # Final executable on list is generic - executable = list_of_executables[-1] + + # Iterate through list to match with CPU features + for executable in list_of_executables: + if cpu_info and 'AVX2' in executable and 'avx2' in flags and which(executable): + break + elif cpu_info and 'AVX' in executable and 'avx' in flags and which(executable): + break + elif cpu_info and 'SSE3' in executable and 'sse3' in flags and which(executable): + break + # to enable 
selection of raxml-ng-mpi
+        elif '-mpi' in executable and which(executable):
+            break
 
     # Check executable is available
     if which(executable):
diff --git a/python/scripts/extract_clade.py b/python/scripts/extract_clade.py
new file mode 100644
index 00000000..a194bf7d
--- /dev/null
+++ b/python/scripts/extract_clade.py
@@ -0,0 +1,112 @@
+# encoding: utf-8
+# Wellcome Trust Sanger Institute and Imperial College London
+# Copyright (C) 2020 Wellcome Trust Sanger Institute and Imperial College London
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+
+# Generic imports
+import sys
+import argparse
+import re
+# Phylogenetic imports
+import dendropy
+# Biopython imports
+from Bio import AlignIO
+from Bio import Phylo
+from Bio import SeqIO
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+
+# command line parsing
+def get_options():
+
+    parser = argparse.ArgumentParser(description='Extract a subset of sequences from the'
+                                                 ' output of a Gubbins analysis',
+                                     prog='extract_clade')
+
+    # input options
+    parser.add_argument('--list',
+                        help = 'List of sequences to extract',
+                        required = True)
+    parser.add_argument('--aln',
+                        help = 'Input alignment (FASTA format)',
+                        required = True)
+    parser.add_argument('--gff',
+                        help = 'GFF of recombinant regions detected by Gubbins',
+                        required = True)
+    parser.add_argument('--tree',
+                        help = 'Final tree generated by Gubbins',
+                        required = True)
+    parser.add_argument('--out',
+                        help = 'Output file prefix',
+                        required = True)
+    parser.add_argument('--out-fmt',
+                        help = 'Format of output alignment',
+                        default = 'fasta')
+
+    return parser.parse_args()
+
+# main code
+if __name__ == "__main__":
+
+    # Get command line options
+    args = get_options()
+
+    # Parse list of input sequences
+    subset = set()
+    # Read in the list of sequence names to retain
+    with open(args.list,'r') as seq_list:
+        for line in seq_list.readlines():
+            subset.add(line.strip().split()[0])
+
+    # Extract from alignment
+    output_aln_name = args.out + '.aln'
+    names_in_alignment = set()
+    with open(output_aln_name,'w') as out_aln:
+        alignment = AlignIO.read(args.aln,'fasta')
+        for taxon in alignment:
+            names_in_alignment.add(taxon.id)
+            if taxon.id in subset:
+                SeqIO.write(taxon, out_aln, args.out_fmt)
+
+    # Check subset sequences are found in alignment
+    not_found_in_dataset = subset - names_in_alignment
+    if len(not_found_in_dataset) > 0:
+        sys.stderr.write('Sequences in subset missing from alignment: ' + \
+                         str(not_found_in_dataset) + '\n')
+        sys.exit(1)
+
+    # Prune from the tree
+    output_tree_name = args.out + '.tree'
+    tree = dendropy.Tree.get(path = args.tree,
+                             schema = 'newick',
+                             preserve_underscores = True)
+    tree.retain_taxa_with_labels(subset)
+    tree.write_to_path(output_tree_name,
+                       'newick')
+
+    # Identify relevant recombination blocks
+    output_gff_name = args.out + '.gff'
+    taxon_pattern = re.compile('taxa="([^"]*)"')
+    with open(args.gff,'r') as in_gff, open(output_gff_name,'w') as out_gff:
+        for line in in_gff.readlines():
+            if line.startswith('##'):
+                out_gff.write(line)
+            else:
+                info = line.rstrip().split('\t')
+                taxon_set = set(taxon_pattern.search(info[8]).group(1).split())
+                if not taxon_set.isdisjoint(subset):
+                    out_gff.write(line)
diff --git a/python/scripts/generate_ska_alignment.py b/python/scripts/generate_ska_alignment.py
new file mode 100644
index 00000000..d3dfeed5
--- /dev/null
+++ b/python/scripts/generate_ska_alignment.py
@@ -0,0 +1,152 @@
+# encoding: utf-8
+# Wellcome Trust Sanger Institute and Imperial College London
+# Copyright (C) 2020 Wellcome Trust Sanger Institute and Imperial College London
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+
+# Generic imports
+import sys
+import os
+import argparse
+import subprocess
+from multiprocessing import Pool
+from functools import partial
+from shutil import which
+
+# command line parsing
+def get_options():
+
+    parser = argparse.ArgumentParser(description='Generate a ska alignment from a list '
+                                                 'of assemblies',
+                                     prog='generate_ska_alignment')
+
+    # input options
+    parser.add_argument('--reference',
+                        help = 'Name of reference sequence to use for alignment',
+                        required = True)
+    parser.add_argument('--fasta',
+                        help = 'List of FASTA files to include in alignment',
+                        default = None,
+                        required = False)
+    parser.add_argument('--fastq',
+                        help = 'List of FASTQ files to include in alignment',
+                        default = None,
+                        required = False)
+    parser.add_argument('--out',
+                        help = 'Name of output alignment',
+                        required = True)
+    parser.add_argument('--k',
+                        help = 'Split kmer size',
+                        type = int,
+                        default = 15)
+    parser.add_argument('--threads',
+                        help = 'Number of threads to use',
+                        type = int,
+                        default = 1)
+    parser.add_argument('--no-cleanup',
+                        help = 'Do not remove intermediate files',
+                        action = 'store_true',
+                        default = False)
+
+    return parser.parse_args()
+
+def map_fasta_sequence(seq, k = None, names = None):
+    subprocess.check_output('ska fasta -o ' + names[seq] + ' -k ' + str(k) + ' ' + seq,
+                            shell = True)
+
+def map_fastq_sequence(read_pair, k = None, names = None):
+    subprocess.check_output('ska fastq -o ' + names[read_pair[0]] + ' -k ' + str(k) + ' ' + \
+                            read_pair[0] + ' ' + read_pair[1],
+                            shell = True)
+
+def ska_map_sequences(seq, k = None, ref = None):
+    subprocess.check_output('ska map -o ' + seq + '.map -k ' + str(k) + ' -r ' + ref + \
+                            ' ' + seq + '.skf',
+                            shell = True)
+
+# main code
+if __name__ == "__main__":
+
+    # Get command line options
+    args = get_options()
+
+    # Check if ska is installed
+    if which('ska') is None:
+        sys.stderr.write('SKA cannot be found on PATH; install with "conda install ska"\n')
+        sys.exit(1)
+
+    # Dictionary for sequence names
+    seq_names = {}
+    all_names = []
+
+    # Make split kmers from assemblies
+    fasta_names = []
+    if args.fasta is not None:
+        # Read in FASTA assemblies
+        with open(args.fasta,'r') as fasta_list:
+            for line in fasta_list.readlines():
+                info = line.strip().split()
+                if os.path.isfile(info[1]):
+                    fasta_names.append(info[1])
+                    seq_names[info[1]] = info[0]
+                    all_names.append(info[0])
+                else:
+                    sys.stderr.write('Unable to find file ' + info[1] + '\n')
+        # Sketch into split kmers
+        with Pool(processes = args.threads) as pool:
+            pool.map(partial(map_fasta_sequence,
+                             k = args.k,
+                             names = seq_names),
+                     fasta_names)
+
+    # Make split kmers from FASTQs
+    fastq_names = []
+    if args.fastq is not None:
+        # Read in FASTQ reads
+        with open(args.fastq,'r') as fastq_list:
+            for line in fastq_list.readlines():
+                info = line.strip().split()
+                if os.path.isfile(info[1]) and os.path.isfile(info[2]):
+                    fastq_names.append((info[1],info[2]))
+                    seq_names[info[1]] = info[0]
+                    all_names.append(info[0])
+                else:
+                    sys.stderr.write('Unable to find files ' + info[1] + ' and ' + info[2] + '\n')
+        # Sketch into split kmers
+        with Pool(processes = args.threads) as pool:
+            pool.map(partial(map_fastq_sequence,
+                             k = args.k,
+                             names = seq_names),
+                     fastq_names)
+
+    # Map sequences
+    with Pool(processes = args.threads) as pool:
+        pool.map(partial(ska_map_sequences,
+                         k = args.k,
+                         ref = args.reference),
+                 all_names)
+
+    # Generate alignment
+    subprocess.check_output('cat ' + ' '.join([seq + '.map.aln' for seq in all_names]) + ' > ' + args.out,
+                            shell = True)
+
+    # Clean up
+    if not args.no_cleanup:
+        subprocess.check_output('rm ' + ' '.join([seq + '.map.aln' for seq in all_names]),
+                                shell = True)
+        subprocess.check_output('rm ' + ' '.join([seq + '.skf' for seq in all_names]),
+                                shell = True)
diff --git a/python/scripts/mask_gubbins_aln.py b/python/scripts/mask_gubbins_aln.py
new file mode 100644
index 00000000..cd404807
--- /dev/null
+++ b/python/scripts/mask_gubbins_aln.py
@@ -0,0 +1,87 @@
+# encoding: utf-8
+# Wellcome Trust Sanger Institute and Imperial College London
+# Copyright (C) 2020 Wellcome Trust Sanger Institute and Imperial College London
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+# + +# Generic imports +import sys +import argparse +import re +# Biopython imports +from Bio import AlignIO +from Bio import Phylo +from Bio import SeqIO +from Bio.Align import MultipleSeqAlignment +from Bio.Seq import Seq + +# command line parsing +def get_options(): + + parser = argparse.ArgumentParser(description='Mask recombinant regions detected by' + ' Gubbins from the input alignment', + prog='mask_gubbins_aln') + + # input options + parser.add_argument('--aln', + help = 'Input alignment (FASTA format)', + required = True) + parser.add_argument('--gff', + help = 'GFF of recombinant regions detected by Gubbins', + required = True) + parser.add_argument('--out', + help = 'Output file name', + required = True) + parser.add_argument('--out-fmt', + help = 'Format of output alignment', + default = 'fasta') + parser.add_argument('--missing-char', + help = 'Character used to replace recombinant sequence', + default = '-') + + return parser.parse_args() + +# main code +if __name__ == "__main__": + + # Get command line options + args = get_options() + + # Read in alignment + overall_taxon_list = [] + alignment = AlignIO.read(args.aln,'fasta') + for taxon in alignment: + overall_taxon_list.append(taxon.id) + taxon.seq = taxon.seq.tomutable() + overall_taxon_set = set(overall_taxon_list) + + # Read recombinant regions from GFF + taxon_pattern = re.compile('taxa="([^"]*)"') + with open(args.gff,'r') as gff_file: + for line in gff_file.readlines(): + if not line.startswith('##'): + info = line.rstrip().split('\t') + start = int(info[3])-1 + end = int(info[4])-1 + taxon_set = set(taxon_pattern.search(info[8]).group(1).split()) + if taxon_set.issubset(overall_taxon_set): + for taxon in alignment: + if taxon.id in taxon_set: + taxon.seq[start:end+1] = args.missing_char*(end-start+1) + + # Write out masked sequence + with open(args.out,'w') as out_aln: + AlignIO.write(alignment, out_aln, args.out_fmt)
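A note on the approach taken by `harmonise_roots` and the reworked `transfer_internal_node_labels_to_tree` above: edges are matched between trees by bipartition rather than by node-iteration order, so roots and labels can be transferred safely even when the two trees were written by different programs with different node orderings. The sketch below is not part of the patch; it illustrates the rerooting step on two invented four-taxon trees that share an unrooted topology but differ in root position and branch lengths, assuming DendroPy 4 semantics:

```python
import dendropy

# Two trees with the same unrooted topology but different roots and branch lengths
taxa = dendropy.TaxonNamespace()
ref = dendropy.Tree.get(data="((A:1,B:1):1,(C:1,D:1):1);", schema="newick",
                        taxon_namespace=taxa, rooting="force-rooted")
new = dendropy.Tree.get(data="(((A:2,B:2):2,C:2):2,D:2);", schema="newick",
                        taxon_namespace=taxa, rooting="force-rooted")
ref.encode_bipartitions()
new.encode_bipartitions()

# Bipartitions of the edges on either side of the reference root
root_bipartitions = [child.edge.bipartition for child in ref.seed_node.child_nodes()]

# Reroot the other tree half-way along the first matching edge, as harmonise_roots does
for node in new.preorder_node_iter():
    if node.edge.bipartition in root_bipartitions:
        half = node.edge_length / 2
        new.reroot_at_edge(node.edge, length1=half, length2=half)
        break

print(new.as_string(schema="newick"))  # now rooted on the A,B|C,D split, matching ref
```

Only one of the two root-adjacent bipartitions needs to be found, which is why both are collected and the loop breaks on the first hit: in a rooted leafset encoding, the complementary bipartition may not exist as an edge of the other tree.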
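Similarly, the `calculate_pij` change keeps the zero-branch-length shortcut consistent with the general case: the function returns log-transformed transition probabilities, so the correct limit is the log of the identity matrix (0.0 on the diagonal, -inf off it), not the identity matrix itself. A quick numerical check, again separate from the patch and using a made-up rate matrix in place of the fitted GTR matrix:

```python
import numpy
from scipy import linalg

# Invented symmetric rate matrix (rows sum to zero), standing in for the GTR matrix
Q = numpy.array([[-3.,  1.,  1.,  1.],
                 [ 1., -3.,  1.,  1.],
                 [ 1.,  1., -3.,  1.],
                 [ 1.,  1.,  1., -3.]])

# General case: log-transformed transition probabilities for a branch of length 0.1
print(numpy.log(linalg.expm(numpy.multiply(0.1, Q))))

# Zero-length limit: expm(0*Q) is the identity, whose log is 0.0 on the
# diagonal and -inf elsewhere -- exactly the hard-coded matrix in the patch
with numpy.errstate(divide='ignore'):
    print(numpy.log(linalg.expm(numpy.multiply(0.0, Q))))
```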