Skip to content

Commit

Permalink
Fix saving of box2taxon when it is empty
Browse files Browse the repository at this point in the history
Removed the old "condense_data" function
- it was badly damaging results when multiple functional columns were used
No more "main_column"
  • Loading branch information
iquasere committed Jan 5, 2024
1 parent 2ecfae9 commit 322bbfc
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 34 deletions.
38 changes: 12 additions & 26 deletions keggcharter.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,11 +203,9 @@ def further_information(
data = get_cross_references(
data, kegg_column=kegg_column, ko_column=ko_column, ec_column=ec_column, cog_column=cog_column, step=step,
cog2ko_file=cog2ko_file, threads=threads)
main_column = kegg_column if kegg_column is not None else ko_column if ko_column is not None else ec_column
data = condense_data(data, main_column)
timed_message(f'Saving new information to: {output}')
data.to_csv(output, sep='\t', index=False)
return data, main_column
return data


def split_list(a, n):
Expand Down Expand Up @@ -378,6 +376,8 @@ def ids_xref(
def get_cross_references(
data: pd.DataFrame, kegg_column: str = None, ko_column: str = None, ec_column: str = None,
cog_column: str = None, cog2ko_file: str = None, threads: int = 15, step: int = 150) -> pd.DataFrame:
if not (kegg_column or ko_column or ec_column or cog_column):
sys.exit('Need to specify a column with either KEGG IDs, KOs, EC numbers or COGs!')
ko_cols = [] # cols with KOs
ec_cols = [] # cols with EC numbers
if kegg_column:
Expand All @@ -396,34 +396,21 @@ def get_cross_references(
data = ids_xref(
data, in_col=cog_column, out_col='KO (cog-column)', in_type='cog', cog2ko_file=cog2ko_file,
threads=threads)
ko_cols.append('KO (cog-column)')
data.drop_duplicates(inplace=True)
# join all unique KOs in a column
data['KO (KEGGCharter)'] = data[ko_cols].apply(
lambda x: ','.join([elem for elem in x if elem is not np.nan]), axis=1)
data['KO (KEGGCharter)'] = data['KO (KEGGCharter)'].apply(lambda x: ','.join(sorted(set(x.split(',')))))
lambda x: ','.join([elem for elem in x if type(elem) != float]), axis=1)
data['KO (KEGGCharter)'] = data['KO (KEGGCharter)'].apply(
lambda x: ','.join(sorted(set([val for val in x.split(',') if len(val) > 0]))))
# join all unique ECs in a column
data['EC number (KEGGCharter)'] = data[ec_cols].apply(
lambda x: ','.join(set([elem for elem in x if elem is not np.nan])), axis=1)
lambda x: ','.join([elem for elem in x if type(elem) != float]), axis=1)
data['EC number (KEGGCharter)'] = data['EC number (KEGGCharter)'].apply(
lambda x: ','.join(sorted(set(x.split(',')))))
if not (kegg_column or ko_column or ec_column or cog_column):
sys.exit('Need to specify a column with either KEGG IDs, KOs, EC numbers or COGs!')
lambda x: ','.join(sorted(set([val for val in x.split(',') if len(val) > 0]))))
return data


def condense_data(data, main_column):
    """
    Consolidate the KEGGCharter annotation columns per value of ``main_column``.

    Every row sharing the same ``main_column`` value receives the same pair of
    'KO (KEGGCharter)' and 'EC number (KEGGCharter)' values: the sorted, de-duplicated union of
    all comma-separated identifiers found for that value.

    Differences from the previous implementation:
      * each ``main_column`` value is aggregated exactly once, so a value having both rows with
        and rows without EC numbers no longer yields two aggregated records (which multiplied
        the rows of the result in the left merge);
      * comma-joined cells are split before de-duplication and the identifiers are sorted, so
        the output has no repeated IDs and is deterministic;
      * missing values are recognised by type rather than by identity with ``np.nan``;
      * the input DataFrame is not modified;
      * a group whose cells are all missing gets NaN in that column (previously the KO column
        could hold an empty string in that situation).

    :param data: pd.DataFrame containing ``main_column``, 'KO (KEGGCharter)' and
        'EC number (KEGGCharter)'
    :param main_column: str, name of the column used to group the annotations
    :return: pd.DataFrame with the remaining columns of ``data`` followed by the two condensed
        annotation columns, with fully duplicated rows dropped
    """
    ko_col = 'KO (KEGGCharter)'
    ec_col = 'EC number (KEGGCharter)'

    def _merge_ids(values):
        # Cells are comma-separated ID strings; anything that is not a string (NaN, None) is missing.
        present = [value for value in values if isinstance(value, str)]
        if not present:
            return np.nan
        unique_ids = {part for value in present for part in value.split(',') if part}
        return ','.join(sorted(unique_ids))

    # One aggregated record per main_column value (rows with a missing key are left out by groupby).
    condensed = data.groupby(main_column).agg({ko_col: _merge_ids, ec_col: _merge_ids}).reset_index()
    # drop() returns a new frame, leaving the caller's DataFrame untouched.
    remaining = data.drop(columns=[ko_col, ec_col])
    return pd.merge(remaining, condensed, on=main_column, how='left').drop_duplicates()


def prepare_data_for_charting(
data: pd.DataFrame, mt_cols: str = None, ko_column: str = 'KO (KEGGCharter)',
distribute_quantification: bool = False):
Expand Down Expand Up @@ -572,8 +559,7 @@ def download_resources(
timed_message('Downloading resources')
download_organism(resources_dir)
taxa = ['ko'] + data[taxa_column].unique().tolist()
if np.nan in taxa:
taxa.remove(np.nan)
taxa = [taxon for taxon in taxa if taxa != np.nan]
taxa_df = parse_organism(f'{resources_dir}/organism')
taxon_to_mmap_to_orthologs = {} # {'Keratinibaculum paraultunense' : {'00190': ['1', '2']}}
if map_all: # attribute all maps and all functions to all taxa, only limit by the data
Expand Down Expand Up @@ -675,7 +661,7 @@ def main():
else:
taxon_to_mmap_to_orthologs = None
else:
data, main_column = further_information(
data = further_information(
data,
f'{args.output}/KEGGCharter_results.tsv',
kegg_column=args.kegg_column,
Expand Down
17 changes: 9 additions & 8 deletions keggpathway_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@
import numpy as np
import os
from subprocess import run
from matplotlib import pyplot as plt, colors, colormaps, cm
from matplotlib import pyplot as plt, colors, colormaps
import pandas as pd
from re import search
import sys
import time
from matplotlib.colors import PowerNorm, to_hex
from matplotlib.colors import to_hex


def set_bgcolor(pathway_element, color):
Expand Down Expand Up @@ -176,7 +175,7 @@ def taxa_colors(hex_values=None, ncolor=1):
return [colors.to_hex(color_scheme(i)) for i in range(ncolor)]
for hex_value in hex_values:
if not search(r'^#(?:[0-9a-fA-F]{3}){1,2}$', hex_value):
sys.exit(Exception("Colors aren't valid hex codes"))
sys.exit("Colors aren't valid hex codes")
return hex_values # has validated hex values and returns the original list


Expand Down Expand Up @@ -286,7 +285,7 @@ def pathway_boxes_differential(self, df, colormap_name="viridis"):
pathway
:param colormap_name: str representing a custom matplotlib colormap to be used
"""
norm = cm.colors.Normalize(vmin=0, vmax=df.max().max())
norm = colors.Normalize(vmin=0, vmax=df.max().max())
cmap = colormaps.get_cmap(colormap_name)
# normalize values to put them between 0 and 1, and obtain RGB values
df = pd.DataFrame([[val for val in vals] for vals in cmap(norm(df))], columns=df.columns, index=df.index)
Expand Down Expand Up @@ -406,6 +405,9 @@ def genomic_potential_taxa(
box2taxon[box].append(grey_taxa)
else:
box2taxon[box] = [grey_taxa]
if len(box2taxon) == 0:
print('No taxonomic information for this map!')
return
name = self.name.split(':')[-1]
self.pathway_box_list(box2taxon, dic_colors) # for every box with KOs identified from the most abundant taxa, sub-boxes are created with colours of the corresponding taxa
self.to_pdf(f'{output}/maps/potential_{name}.pdf')
Expand All @@ -430,7 +432,6 @@ def differential_colorbar(self, df, filename, colormap_name='viridis'):
ax.remove()
plt.savefig(filename, bbox_inches='tight')


def differential_expression_sample(
self, data, samples, ko_column, mmaps2taxa, taxa_column='Taxonomic lineage (GENUS)', output=None,
colormap_name='viridis'):
Expand All @@ -453,7 +454,8 @@ def differential_expression_sample(
df = df[df['Boxes'].notnull()]
df = expand_by_list_column(df, column='Boxes')
if len(df) == 0:
return 1
print('No differential information for this map!')
return
df = df.groupby('Boxes')[samples].sum()
name = self.name.split(':')[-1]
df.to_csv(f'{output}/tsvs/differential_{name}.tsv', sep='\t')
Expand All @@ -464,7 +466,6 @@ def differential_expression_sample(
self.add_legend(
f'{output}/maps/differential_{name}.pdf', f'{output}/maps/differential_{name}_legend.png',
f'{output}/maps/differential_{self.title.replace("/", "|")}.png')
return 0

def add_legend(self, kegg_map_file, legend_file, output):
"""
Expand Down

0 comments on commit 322bbfc

Please sign in to comment.