maxplanck-ie · LeilyR · Feb 23, 2021 · Sep 17, 2019 · Sep 17, 2019 · Sep 18, 2019
diff --git a/README.md b/README.md
@@ -6,6 +6,11 @@ appended by orthogonal data, like associated gene expression. ultraheatmap
 facilitates adding orthogonal data to a deepTools matrix and allows to cluster a
 genomic heatmap by selected samples in just one single command-line call.
 
+<p align="center">
+  <img src="./docs/content/images/ultraheatmap_ferrari.png">
+</p>
+
+This figure has been generated by the ultraheatmap on real data. The first two columns show the ChIPs signal while the last column shows the gene expression of the closest gene to each region of any cluster.
 
 ## Getting Started
 
@@ -35,6 +40,7 @@ To install the program in this environment:
 
               $ python setup.py install
 
+
 from the ultraheatmap directory.
 
 
@@ -230,7 +236,6 @@ clustering algorithm with 2 clusters (`--kmeans 2`) based on the signal of first
                 A bed file to save the closest genes (default: None)
 
 
-
 example
 
               $ addFeatureToMatrix -m  deeptools_matrix.gz -o appended_matrix.gz \
@@ -239,4 +244,4 @@ example
 
 The above command adds extra columns to the input matrix. The output will be a matrix with
 `deeptools` format which can be visualized by `deeptools plotHeatmap`. The extra columns could be gene expression. If annotation file is provided, program finds the closest gene for each region of the input matrix and looks for the gene expression of that gene from the given feature tables.
-If no annotation is given, program checks for the exact match between regions name from the input matrix and finds the same name on the given feature tables.
+If no annotation file is given, program checks for the exact match between regions name from the input matrix and finds the same name on the given feature tables. The above image presents a matrix which has been generated by this module after mapping genes to peaks on real data.
diff --git a/docs/content/images/ultraheatmap_ferrari.png b/docs/content/images/ultraheatmap_ferrari.png
diff --git a/requirements.yaml b/requirements.yaml
@@ -10,3 +10,4 @@ dependencies:
   - pybedtools=0.7.10
   - pybigwig=0.3.10
   - pyyaml >= 5.1
+  - pysam >= 0.16.0.1
diff --git a/ultraheatmap/addFeatureToMatrix.py b/ultraheatmap/addFeatureToMatrix.py
@@ -127,7 +127,6 @@ def main():
                                             args.annotationOutput,
                                             args.referencePoint,
                                             args.closestGenesOutput)  # XXX instead of all these arguments i can simply add args.
-
         # paste an extra column per table to the input matrix
         extract_ge_folchange_per_peak(regions, args.tables, closestMapping,
                                       args.Features, args.idcolumn, hm)

diff --git a/ultraheatmap/mapClosestGenes.py b/ultraheatmap/mapClosestGenes.py
@@ -2,25 +2,41 @@
 from collections import defaultdict
 
 def __splitClosestMapping(closestMapping, col_split):
-    aInterval = [str(x).split('\t')[0:col_split] for x in closestMapping]
+    aInterval = []
+    bInterval = []
+
+    for x in closestMapping:
+        a = str(x).split('\t')[0:col_split]
+        b = str(x).split('\t')[col_split:]
+        if '-1' not in b:
+            aInterval.append(';'.join(str(v) for v in a))
+            bInterval.append(b)
+        else:
+            print("region ", a , "got no gene to be found as its closest gene.")
 
-    bInterval = [str(x).split('\t')[col_split:] for x in closestMapping] #XXX what if there is no closest gene for a peak? the current code crashes if there is such a peak
     gff = [pybedtools.create_interval_from_list(i) for i in bInterval]
 
     return({'A': aInterval,'B': gff})
 
-def __keymap_from_bed_and_gff(peaks, gff, key_definition = 'gene_id'):
-    assert (len(peaks) == len(gff)), ("{} and {} are not the same size").format(len(peaks), len(gff))
+
+def __keymap_from_bed_and_gff(peaks, aInterval, gff, key_definition = 'gene_id'):
+    count = 0
     keyMap = defaultdict(lambda: [])
-    for i in range(0,len(peaks)):
-        ckey = ';'.join(peaks[i][0:7]) ##XXX shall we keep the number hard coded??
-        cval = gff[i][key_definition]
+
+    for peak in peaks:
+        ckey = ';'.join(str(v) for v in peak)
+        if ckey in aInterval:
+            i = aInterval.index(ckey)
+            cval = gff[i][key_definition]
+        else:
+            cval = "no_gene"
+            count += 1
         keyMap[ckey] = cval
 
     return(keyMap)
 
-def keymap_from_closest_genes(closestMapping, peaks):
-    assert (type(peaks) is type(pybedtools.BedTool())), ("{} is not class {}").format(type(peaks), type(pybedtools.BedTool()))
-    splitDict = __splitClosestMapping(closestMapping, peaks.field_count())
-    keyMap_closest = __keymap_from_bed_and_gff(splitDict['A'], splitDict['B'])
+
+def keymap_from_closest_genes(closestMapping, peaks, field_count):
+    splitDict = __splitClosestMapping(closestMapping, field_count)
+    keyMap_closest = __keymap_from_bed_and_gff(peaks, splitDict['A'], splitDict['B'])
     return(keyMap_closest)
diff --git a/ultraheatmap/parseTables.py b/ultraheatmap/parseTables.py
@@ -83,13 +83,15 @@ def extract_ge_folchange_per_peak(peaks, tables, closestMapping, features,
     ## peak_keys: cols 1-7 from bed format (1-based index)
     Peaks = BedTool(peaks)
     Peaks=Peaks.sort()
-    keyMap_closest = keymap_from_closest_genes(closestMapping, Peaks)
+    field_count = Peaks.field_count()
+    keyMap_closest = keymap_from_closest_genes(closestMapping, peaks, field_count)
     __update_matrix_values(peaks, keyMap_closest, tables,features,IdColumn,hm)
 
 def __getValuesFromGETable(peaks, keyMap_closest, table, features, IdColumn):
     """
 
     """
+    count = 0
     v = np.empty((len(peaks), len(features)), dtype=float)
     for i, peak in enumerate(peaks):
         key = ';'.join(map(str,peak))
@@ -99,9 +101,12 @@ def __getValuesFromGETable(peaks, keyMap_closest, table, features, IdColumn):
                 x = float(table[table[IdColumn] == value][feature])
                 if np.isnan(x):
                      x = np.nan
+                else:
+                    count = count + 1
                 v[i,j] = x
         else:
             v[i] = [ np.nan ]*len(features)
+
     return v
 
 
@@ -129,10 +134,11 @@ def __update_matrix_values(peaks, keyMap_closest, tables, features, IdColumn, hm
     have been found.
     """
     assert len(keyMap_closest) == len(peaks)
-    valuesTab = np.empty((len(peaks), len(tables)*len(features)), dtype=float)
+    valuesTab = np.empty([len(peaks),len(tables)*len(features)], dtype= float)
     for i, table in enumerate(tables):
         table = parseTable(table)
         values = __getValuesFromGETable(peaks, keyMap_closest, table, features, IdColumn)
+
         valuesTab[:,i*len(features):(i*len(features)+len(features))] = values
         for feature in features:
             hm.matrix.sample_labels = hm.matrix.sample_labels + ["table"+str(i)+"_"+feature]
@@ -142,11 +148,13 @@ def __update_matrix_values(peaks, keyMap_closest, tables, features, IdColumn, hm
     __update_parameters(hm,len(tables)*len(features))
 
 
-
-
 def parseTable(table_file):
+    """
+
+    """
     return(pd.read_csv(table_file, sep ='\t'))
 
+
 def parseMatrixRegions(regions):
     all_regions = []
     for group in  regions:
@@ -155,6 +163,7 @@ def parseMatrixRegions(regions):
                 all_regions.append([region[0],start, end, region[2], region[3], region[4], region[5]])
     return all_regions
 
+
 def __update_parameters(hm,length):
     """
     Updates matrix parameters
@@ -168,6 +177,7 @@ def __update_parameters(hm,length):
         hm.parameters['ref point'].append(None)
         hm.parameters['bin size'].append(10)
 
+
 def update_matrix_values(peaks, tables,features, IdColumn,hm):
     """
     The function is used for the one to one mapping between the name of the enriched regions
@@ -186,6 +196,7 @@ def update_matrix_values(peaks, tables,features, IdColumn,hm):
     hm.matrix.sample_boundaries = hm.matrix.sample_boundaries +[x+1+current_last_col for x in range(len(tables)*len(features))]
     __update_parameters(hm,len(tables)*len(features))
 
+
 def __read_tables_columns(tables, features):
     for table in tables:
          df = parseTable(table)