diff --git a/keggcharter.py b/keggcharter.py index e9fd698..4da5326 100644 --- a/keggcharter.py +++ b/keggcharter.py @@ -203,11 +203,9 @@ def further_information( data = get_cross_references( data, kegg_column=kegg_column, ko_column=ko_column, ec_column=ec_column, cog_column=cog_column, step=step, cog2ko_file=cog2ko_file, threads=threads) - main_column = kegg_column if kegg_column is not None else ko_column if ko_column is not None else ec_column - data = condense_data(data, main_column) timed_message(f'Saving new information to: {output}') data.to_csv(output, sep='\t', index=False) - return data, main_column + return data def split_list(a, n): @@ -378,6 +376,8 @@ def ids_xref( def get_cross_references( data: pd.DataFrame, kegg_column: str = None, ko_column: str = None, ec_column: str = None, cog_column: str = None, cog2ko_file: str = None, threads: int = 15, step: int = 150) -> pd.DataFrame: + if not (kegg_column or ko_column or ec_column or cog_column): + sys.exit('Need to specify a column with either KEGG IDs, KOs, EC numbers or COGs!') ko_cols = [] # cols with KOs ec_cols = [] # cols with EC numbers if kegg_column: @@ -396,34 +396,21 @@ def get_cross_references( data = ids_xref( data, in_col=cog_column, out_col='KO (cog-column)', in_type='cog', cog2ko_file=cog2ko_file, threads=threads) + ko_cols.append('KO (cog-column)') + data.drop_duplicates(inplace=True) # join all unique KOs in a column data['KO (KEGGCharter)'] = data[ko_cols].apply( - lambda x: ','.join([elem for elem in x if elem is not np.nan]), axis=1) - data['KO (KEGGCharter)'] = data['KO (KEGGCharter)'].apply(lambda x: ','.join(sorted(set(x.split(','))))) + lambda x: ','.join([elem for elem in x if type(elem) != float]), axis=1) + data['KO (KEGGCharter)'] = data['KO (KEGGCharter)'].apply( + lambda x: ','.join(sorted(set([val for val in x.split(',') if len(val) > 0])))) # join all unique ECs in a column data['EC number (KEGGCharter)'] = data[ec_cols].apply( - lambda x: ','.join(set([elem for elem in x 
if elem is not np.nan])), axis=1) + lambda x: ','.join([elem for elem in x if type(elem) != float]), axis=1) data['EC number (KEGGCharter)'] = data['EC number (KEGGCharter)'].apply( - lambda x: ','.join(sorted(set(x.split(','))))) - if not (kegg_column or ko_column or ec_column or cog_column): - sys.exit('Need to specify a column with either KEGG IDs, KOs, EC numbers or COGs!') + lambda x: ','.join(sorted(set([val for val in x.split(',') if len(val) > 0])))) return data -def condense_data(data, main_column): - onlykos = data[data['KO (KEGGCharter)'].notnull() & (data['EC number (KEGGCharter)'].isnull())][ - [main_column, 'KO (KEGGCharter)']] - onlykos = onlykos.groupby(main_column).agg({'KO (KEGGCharter)': lambda x: ','.join(set(x))}).reset_index() - onlykos['EC number (KEGGCharter)'] = [np.nan] * len(onlykos) - wecs = data[data['EC number (KEGGCharter)'].notnull()][[main_column, 'KO (KEGGCharter)', 'EC number (KEGGCharter)']] - wecs = wecs.groupby(main_column).agg( - {'KO (KEGGCharter)': lambda x: ','.join(set([elem for elem in x if elem is not np.nan])), - 'EC number (KEGGCharter)': lambda x: ','.join(set(x))}).reset_index() - del data['KO (KEGGCharter)'] - del data['EC number (KEGGCharter)'] - return pd.merge(data, pd.concat([onlykos, wecs]), on=main_column, how='left').drop_duplicates() - - def prepare_data_for_charting( data: pd.DataFrame, mt_cols: str = None, ko_column: str = 'KO (KEGGCharter)', distribute_quantification: bool = False): @@ -572,8 +559,7 @@ def download_resources( timed_message('Downloading resources') download_organism(resources_dir) taxa = ['ko'] + data[taxa_column].unique().tolist() - if np.nan in taxa: - taxa.remove(np.nan) + taxa = [taxon for taxon in taxa if pd.notna(taxon)] taxa_df = parse_organism(f'{resources_dir}/organism') taxon_to_mmap_to_orthologs = {} # {'Keratinibaculum paraultunense' : {'00190': ['1', '2']}} if map_all: # attribute all maps and all functions to all taxa, only limit by the data @@ -675,7 +661,7 @@ def main(): 
else: taxon_to_mmap_to_orthologs = None else: - data, main_column = further_information( + data = further_information( data, f'{args.output}/KEGGCharter_results.tsv', kegg_column=args.kegg_column, diff --git a/keggpathway_map.py b/keggpathway_map.py index 9bc230e..bf6123a 100644 --- a/keggpathway_map.py +++ b/keggpathway_map.py @@ -6,12 +6,11 @@ import numpy as np import os from subprocess import run -from matplotlib import pyplot as plt, colors, colormaps, cm +from matplotlib import pyplot as plt, colors, colormaps import pandas as pd from re import search import sys -import time -from matplotlib.colors import PowerNorm, to_hex +from matplotlib.colors import to_hex def set_bgcolor(pathway_element, color): @@ -176,7 +175,7 @@ def taxa_colors(hex_values=None, ncolor=1): return [colors.to_hex(color_scheme(i)) for i in range(ncolor)] for hex_value in hex_values: if not search(r'^#(?:[0-9a-fA-F]{3}){1,2}$', hex_value): - sys.exit(Exception("Colors aren't valid hex codes")) + sys.exit("Colors aren't valid hex codes") return hex_values # has validated hex values and returns the original list @@ -286,7 +285,7 @@ def pathway_boxes_differential(self, df, colormap_name="viridis"): pathway :param colormap_name: str representing a costum matplotlib colormap to be used """ - norm = cm.colors.Normalize(vmin=0, vmax=df.max().max()) + norm = colors.Normalize(vmin=0, vmax=df.max().max()) cmap = colormaps.get_cmap(colormap_name) # normalize values to put them between 0 and 1, and obtain RGB values df = pd.DataFrame([[val for val in vals] for vals in cmap(norm(df))], columns=df.columns, index=df.index) @@ -406,6 +405,9 @@ def genomic_potential_taxa( box2taxon[box].append(grey_taxa) else: box2taxon[box] = [grey_taxa] + if len(box2taxon) == 0: + print('No taxonomic information for this map!') + return name = self.name.split(':')[-1] self.pathway_box_list(box2taxon, dic_colors) # for every box with KOs identified from the most abundant taxa, sub-boxes are created with colours of the 
corresponding taxa self.to_pdf(f'{output}/maps/potential_{name}.pdf') @@ -430,7 +432,6 @@ def differential_colorbar(self, df, filename, colormap_name='viridis'): ax.remove() plt.savefig(filename, bbox_inches='tight') - def differential_expression_sample( self, data, samples, ko_column, mmaps2taxa, taxa_column='Taxonomic lineage (GENUS)', output=None, colormap_name='viridis'): @@ -453,7 +454,8 @@ def differential_expression_sample( df = df[df['Boxes'].notnull()] df = expand_by_list_column(df, column='Boxes') if len(df) == 0: - return 1 + print('No differential information for this map!') + return df = df.groupby('Boxes')[samples].sum() name = self.name.split(':')[-1] df.to_csv(f'{output}/tsvs/differential_{name}.tsv', sep='\t') @@ -464,7 +466,6 @@ def differential_expression_sample( self.add_legend( f'{output}/maps/differential_{name}.pdf', f'{output}/maps/differential_{name}_legend.png', f'{output}/maps/differential_{self.title.replace("/", "|")}.png') - return 0 def add_legend(self, kegg_map_file, legend_file, output): """