Fix on saving box2taxon when it is empty

Removed the old "condense_data" function - it was damaging results badly when multiple functional columns were used. No more "main_column".
iquasere · Jan 5, 2024 · 322bbfc · 322bbfc
1 parent 2ecfae9
commit 322bbfc
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 34 deletions.
diff --git a/keggcharter.py b/keggcharter.py
@@ -203,11 +203,9 @@ def further_information(
     data = get_cross_references(
         data, kegg_column=kegg_column, ko_column=ko_column, ec_column=ec_column, cog_column=cog_column, step=step,
         cog2ko_file=cog2ko_file, threads=threads)
-    main_column = kegg_column if kegg_column is not None else ko_column if ko_column is not None else ec_column
-    data = condense_data(data, main_column)
     timed_message(f'Saving new information to: {output}')
     data.to_csv(output, sep='\t', index=False)
-    return data, main_column
+    return data
 
 
 def split_list(a, n):
@@ -378,6 +376,8 @@ def ids_xref(
 def get_cross_references(
         data: pd.DataFrame, kegg_column: str = None, ko_column: str = None, ec_column: str = None,
         cog_column: str = None, cog2ko_file: str = None, threads: int = 15, step: int = 150) -> pd.DataFrame:
+    if not (kegg_column or ko_column or ec_column or cog_column):
+        sys.exit('Need to specify a column with either KEGG IDs, KOs, EC numbers or COGs!')
     ko_cols = []    # cols with KOs
     ec_cols = []    # cols with EC numbers
     if kegg_column:
@@ -396,34 +396,21 @@ def get_cross_references(
         data = ids_xref(
             data, in_col=cog_column, out_col='KO (cog-column)', in_type='cog', cog2ko_file=cog2ko_file,
             threads=threads)
+        ko_cols.append('KO (cog-column)')
+    data.drop_duplicates(inplace=True)
     # join all unique KOs in a column
     data['KO (KEGGCharter)'] = data[ko_cols].apply(
-        lambda x: ','.join([elem for elem in x if elem is not np.nan]), axis=1)
-    data['KO (KEGGCharter)'] = data['KO (KEGGCharter)'].apply(lambda x: ','.join(sorted(set(x.split(',')))))
+        lambda x: ','.join([elem for elem in x if type(elem) != float]), axis=1)
+    data['KO (KEGGCharter)'] = data['KO (KEGGCharter)'].apply(
+        lambda x: ','.join(sorted(set([val for val in x.split(',') if len(val) > 0]))))
     # join all unique ECs in a column
     data['EC number (KEGGCharter)'] = data[ec_cols].apply(
-        lambda x: ','.join(set([elem for elem in x if elem is not np.nan])), axis=1)
+        lambda x: ','.join([elem for elem in x if type(elem) != float]), axis=1)
     data['EC number (KEGGCharter)'] = data['EC number (KEGGCharter)'].apply(
-        lambda x: ','.join(sorted(set(x.split(',')))))
-    if not (kegg_column or ko_column or ec_column or cog_column):
-        sys.exit('Need to specify a column with either KEGG IDs, KOs, EC numbers or COGs!')
+        lambda x: ','.join(sorted(set([val for val in x.split(',') if len(val) > 0]))))
     return data
 
 
-def condense_data(data, main_column):
-    onlykos = data[data['KO (KEGGCharter)'].notnull() & (data['EC number (KEGGCharter)'].isnull())][
-        [main_column, 'KO (KEGGCharter)']]
-    onlykos = onlykos.groupby(main_column).agg({'KO (KEGGCharter)': lambda x: ','.join(set(x))}).reset_index()
-    onlykos['EC number (KEGGCharter)'] = [np.nan] * len(onlykos)
-    wecs = data[data['EC number (KEGGCharter)'].notnull()][[main_column, 'KO (KEGGCharter)', 'EC number (KEGGCharter)']]
-    wecs = wecs.groupby(main_column).agg(
-        {'KO (KEGGCharter)': lambda x: ','.join(set([elem for elem in x if elem is not np.nan])),
-         'EC number (KEGGCharter)': lambda x: ','.join(set(x))}).reset_index()
-    del data['KO (KEGGCharter)']
-    del data['EC number (KEGGCharter)']
-    return pd.merge(data, pd.concat([onlykos, wecs]), on=main_column, how='left').drop_duplicates()
-
-
 def prepare_data_for_charting(
         data: pd.DataFrame, mt_cols: str = None, ko_column: str = 'KO (KEGGCharter)',
         distribute_quantification: bool = False):
@@ -572,8 +559,7 @@ def download_resources(
     timed_message('Downloading resources')
     download_organism(resources_dir)
     taxa = ['ko'] + data[taxa_column].unique().tolist()
-    if np.nan in taxa:
-        taxa.remove(np.nan)
+    taxa = [taxon for taxon in taxa if taxa != np.nan]
     taxa_df = parse_organism(f'{resources_dir}/organism')
     taxon_to_mmap_to_orthologs = {}  # {'Keratinibaculum paraultunense' : {'00190': ['1', '2']}}
     if map_all:     # attribute all maps and all functions to all taxa, only limit by the data
@@ -675,7 +661,7 @@ def main():
         else:
             taxon_to_mmap_to_orthologs = None
     else:
-        data, main_column = further_information(
+        data = further_information(
             data,
             f'{args.output}/KEGGCharter_results.tsv',
             kegg_column=args.kegg_column,

diff --git a/keggpathway_map.py b/keggpathway_map.py
@@ -6,12 +6,11 @@
 import numpy as np
 import os
 from subprocess import run
-from matplotlib import pyplot as plt, colors, colormaps, cm
+from matplotlib import pyplot as plt, colors, colormaps
 import pandas as pd
 from re import search
 import sys
-import time
-from matplotlib.colors import PowerNorm, to_hex
+from matplotlib.colors import to_hex
 
 
 def set_bgcolor(pathway_element, color):
@@ -176,7 +175,7 @@ def taxa_colors(hex_values=None, ncolor=1):
         return [colors.to_hex(color_scheme(i)) for i in range(ncolor)]
     for hex_value in hex_values:
         if not search(r'^#(?:[0-9a-fA-F]{3}){1,2}$', hex_value):
-            sys.exit(Exception("Colors aren't valid hex codes"))
+            sys.exit("Colors aren't valid hex codes")
     return hex_values  # has validated hex values and returns the original list
 
 
@@ -286,7 +285,7 @@ def pathway_boxes_differential(self, df, colormap_name="viridis"):
         pathway
         :param colormap_name: str representing a costum matplotlib colormap to be used
         """
-        norm = cm.colors.Normalize(vmin=0, vmax=df.max().max())
+        norm = colors.Normalize(vmin=0, vmax=df.max().max())
         cmap = colormaps.get_cmap(colormap_name)
         # normalize values to put them between 0 and 1, and obtain RGB values
         df = pd.DataFrame([[val for val in vals] for vals in cmap(norm(df))], columns=df.columns, index=df.index)
@@ -406,6 +405,9 @@ def genomic_potential_taxa(
                             box2taxon[box].append(grey_taxa)
                         else:
                             box2taxon[box] = [grey_taxa]
+        if len(box2taxon) == 0:
+            print('No taxonomic information for this map!')
+            return
         name = self.name.split(':')[-1]
         self.pathway_box_list(box2taxon, dic_colors)  # for every box with KOs identified from the most abundant taxa, sub-boxes are created with colours of the corresponding taxa
         self.to_pdf(f'{output}/maps/potential_{name}.pdf')
@@ -430,7 +432,6 @@ def differential_colorbar(self, df, filename, colormap_name='viridis'):
         ax.remove()
         plt.savefig(filename, bbox_inches='tight')
 
-
     def differential_expression_sample(
             self, data, samples, ko_column, mmaps2taxa, taxa_column='Taxonomic lineage (GENUS)', output=None,
             colormap_name='viridis'):
@@ -453,7 +454,8 @@ def differential_expression_sample(
         df = df[df['Boxes'].notnull()]
         df = expand_by_list_column(df, column='Boxes')
         if len(df) == 0:
-            return 1
+            print('No differential information for this map!')
+            return
         df = df.groupby('Boxes')[samples].sum()
         name = self.name.split(':')[-1]
         df.to_csv(f'{output}/tsvs/differential_{name}.tsv', sep='\t')
@@ -464,7 +466,6 @@ def differential_expression_sample(
         self.add_legend(
             f'{output}/maps/differential_{name}.pdf', f'{output}/maps/differential_{name}_legend.png',
             f'{output}/maps/differential_{self.title.replace("/", "|")}.png')
-        return 0
 
     def add_legend(self, kegg_map_file, legend_file, output):
         """