Skip to content

Commit

Permalink
Fix HOGs when species removed, resolves #602
Browse files Browse the repository at this point in the history
  • Loading branch information
davidemms committed Aug 6, 2021
1 parent 1b3f37c commit 2bc15f7
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 13 deletions.
5 changes: 4 additions & 1 deletion scripts_of/stride.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,10 @@ def GetRoot(speciesTreeFN, treesDir, GeneToSpeciesMap, nProcessors, qWriteDupTre
speciesTree = tree.Tree(speciesTreeFN, format=2)
qHaveBranchSupport = True
except:
speciesTree = tree.Tree(speciesTreeFN, format=1)
try:
speciesTree = tree.Tree(speciesTreeFN, format=1)
except:
print("ERROR: Species tree inference failed")
species, dict_clades, clade_names = AnalyseSpeciesTree(speciesTree)
pool = mp.Pool(nProcessors, maxtasksperchild=1)
list_of_dicts = pool.map(SupportedHierachies_wrapper2, [(fn, GeneToSpeciesMap, species, dict_clades, clade_names, qWriteDupTrees) for fn in glob.glob(treesDir + "/*")])
Expand Down
25 changes: 13 additions & 12 deletions scripts_of/trees2ologs_of.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def __init__(self, species_tree, species_tree_node_names, seq_ids, sp_ids, speci
os.mkdir(d)
self.fhs = dict()
self.iSps = list(map(str, sorted(species_to_use))) # list of strings
self.i_sp_to_index = {isp:i_col for i_col, isp in enumerate(self.iSps)}
self.i_sp_to_index = {int(isp):i_col for i_col, isp in enumerate(self.iSps)}
self.iHOG = defaultdict(int)
self.species_tree = species_tree
species_names = [sp_ids[i] for i in self.iSps]
Expand Down Expand Up @@ -205,8 +205,8 @@ def write_clade_v2(self, n, og_name, split_paralogous_clades_from_same_hog = Fal

# get scl & remove HOGs that can't be written yet due to duplications
# 0. Get the scl units below this node in the gene tree
# I.e. get genes (indexed by species) below each scl (the relevant gene tree nodes)
genes_per_species_index = self.get_descendant_genes(n)
# I.e. get genes (referenced by species ID) below each scl (the relevant gene tree nodes)
genes_per_species_id = self.get_descendant_genes(n)

# scl_mrca = {nn.sp_node for nn in scl if not nn.is_leaf()}
if debug: print("Dups below: " + str(n.dups_below))
Expand All @@ -225,7 +225,7 @@ def write_clade_v2(self, n, og_name, split_paralogous_clades_from_same_hog = Fal
return []

if debug: print(hogs_to_write)
return self.get_hog_file_entries(hogs_to_write, genes_per_species_index, og_name, n.name)
return self.get_hog_file_entries(hogs_to_write, genes_per_species_id, og_name, n.name)

def get_descendant_genes(self, n):
"""
Expand All @@ -234,27 +234,26 @@ def get_descendant_genes(self, n):
Args:
n - node under consideration
Returns:
dict:sp_index->string of genes, comma separated
dict:sp_id (int)->string of genes, comma separated
"""
genes_per_species = defaultdict(list) # iCol (before 'name' columns) -> text string of genes
genes = n.get_leaves()
q_have_legitimate_gene = False # may be misplaced genes
for g in genes:
if "X" in g.features: continue
q_have_legitimate_gene = True
isp = g.name.split("_")[0]
genes_per_species[self.i_sp_to_index[isp]].append(self.seq_ids[g.name])
isp = int(g.name.split("_")[0])
genes_per_species[isp].append(self.seq_ids[g.name])
for k, v in genes_per_species.items():
genes_per_species[k] = ", ".join(v)
return genes_per_species

def get_hog_file_entries(self, hogs_to_write, genes_per_species_index, og_name, gt_node_name):
def get_hog_file_entries(self, hogs_to_write, genes_per_species_id, og_name, gt_node_name):
"""
Write the HOGs that can be determined from this gene tree node.
Args:
hogs_to_write - list of HOG names
scl_units - dict:st_node_name->(dict:sp_index->genes string for HOGs file)
e.g. st_node_name is N5 for internal node, 11 for leaf
genes_per_species_id - dict:sp_id (int)->str, comma separated list of genes
og_name - OG name
gt_node_name - gene tree node name
Implementation:
Expand All @@ -268,12 +267,14 @@ def get_hog_file_entries(self, hogs_to_write, genes_per_species_index, og_name,
q_empty = True
# 2. We know the scl, these are the 'taxonomic units' available (clades or individual species in species tree for this node of the gene tree)
# Note there can be at most one of each. Only a subset of these will fall under this HOG.
units = self.hog_contents[h].intersection(genes_per_species_index.keys())
units = self.hog_contents[h].intersection(genes_per_species_id.keys())
# print("Units: " + str(units))
genes_row = ["" for _ in self.iSps]
# put the units into the row
for isp in units:
genes_row[isp] = genes_per_species_index[isp]
# translate the species ID to the species column it should be in
# after accounting for removed species
genes_row[self.i_sp_to_index[isp]] = genes_per_species_id[isp]
q_empty = False
if not q_empty:
# print((h, genes_row))
Expand Down

0 comments on commit 2bc15f7

Please sign in to comment.