Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Develop #36

Merged
merged 14 commits into from
Feb 23, 2021
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ appended by orthogonal data, like associated gene expression. ultraheatmap
facilitates adding orthogonal data to a deepTools matrix and allows to cluster a
genomic heatmap by selected samples in just one single command-line call.

<p align="center">
<img src="./docs/content/images/ultraheatmap_ferrari.png">
</p>

This figure has been generated by the ultraheatmap on real data. The first two columns show the ChIPs signal while the last column shows the gene expression of the closest gene to each region of any cluster.

## Getting Started

Expand Down Expand Up @@ -35,6 +40,7 @@ To install the program in this environment:

$ python setup.py install


from the ultraheatmap directory.


Expand Down Expand Up @@ -230,7 +236,6 @@ clustering algorithm with 2 clusters (`--kmeans 2`) based on the signal of first
A bed file to save the closest genes (default: None)



example

$ addFeatureToMatrix -m deeptools_matrix.gz -o appended_matrix.gz \
Expand All @@ -239,4 +244,4 @@ example

The above command adds extra columns to the input matrix. The output will be a matrix with
`deeptools` format which can be visualized by `deeptools plotHeatmap`. The extra columns could be gene expression. If annotation file is provided, program finds the closest gene for each region of the input matrix and looks for the gene expression of that gene from the given feature tables.
If no annotation is given, program checks for the exact match between regions name from the input matrix and finds the same name on the given feature tables.
If no annotation file is given, program checks for the exact match between regions name from the input matrix and finds the same name on the given feature tables. The above image presents a matrix which has been generated by this module after mapping genes to peaks on real data.
Binary file added docs/content/images/ultraheatmap_ferrari.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions requirements.yaml
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ dependencies:
- pybedtools=0.7.10
- pybigwig=0.3.10
- pyyaml >= 5.1
- pysam >= 0.16.0.1
1 change: 0 additions & 1 deletion ultraheatmap/addFeatureToMatrix.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ def main():
args.annotationOutput,
args.referencePoint,
args.closestGenesOutput) # XXX instead of all these arguments i can simply add args.

# paste an extra column per table to the input matrix
extract_ge_folchange_per_peak(regions, args.tables, closestMapping,
args.Features, args.idcolumn, hm)
Expand Down
38 changes: 27 additions & 11 deletions ultraheatmap/mapClosestGenes.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,41 @@
from collections import defaultdict

def __splitClosestMapping(closestMapping, col_split):
aInterval = [str(x).split('\t')[0:col_split] for x in closestMapping]
aInterval = []
bInterval = []

for x in closestMapping:
a = str(x).split('\t')[0:col_split]
b = str(x).split('\t')[col_split:]
if '-1' not in b:
aInterval.append(';'.join(str(v) for v in a))
bInterval.append(b)
else:
print("region ", a , "got no gene to be found as its closest gene.")

bInterval = [str(x).split('\t')[col_split:] for x in closestMapping] #XXX what if there is no closest gene for a peak? the current code crashes if there is such a peak
gff = [pybedtools.create_interval_from_list(i) for i in bInterval]

return({'A': aInterval,'B': gff})

def __keymap_from_bed_and_gff(peaks, gff, key_definition = 'gene_id'):
assert (len(peaks) == len(gff)), ("{} and {} are not the same size").format(len(peaks), len(gff))

def __keymap_from_bed_and_gff(peaks, aInterval, gff, key_definition = 'gene_id'):
count = 0
keyMap = defaultdict(lambda: [])
for i in range(0,len(peaks)):
ckey = ';'.join(peaks[i][0:7]) ##XXX shall we keep the number hard coded??
cval = gff[i][key_definition]

for peak in peaks:
ckey = ';'.join(str(v) for v in peak)
if ckey in aInterval:
i = aInterval.index(ckey)
cval = gff[i][key_definition]
else:
cval = "no_gene"
count += 1
keyMap[ckey] = cval

return(keyMap)

def keymap_from_closest_genes(closestMapping, peaks):
assert (type(peaks) is type(pybedtools.BedTool())), ("{} is not class {}").format(type(peaks), type(pybedtools.BedTool()))
splitDict = __splitClosestMapping(closestMapping, peaks.field_count())
keyMap_closest = __keymap_from_bed_and_gff(splitDict['A'], splitDict['B'])

def keymap_from_closest_genes(closestMapping, peaks, field_count):
splitDict = __splitClosestMapping(closestMapping, field_count)
keyMap_closest = __keymap_from_bed_and_gff(peaks, splitDict['A'], splitDict['B'])
return(keyMap_closest)
19 changes: 15 additions & 4 deletions ultraheatmap/parseTables.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,15 @@ def extract_ge_folchange_per_peak(peaks, tables, closestMapping, features,
## peak_keys: cols 1-7 from bed format (1-based index)
Peaks = BedTool(peaks)
Peaks=Peaks.sort()
keyMap_closest = keymap_from_closest_genes(closestMapping, Peaks)
field_count = Peaks.field_count()
keyMap_closest = keymap_from_closest_genes(closestMapping, peaks, field_count)
__update_matrix_values(peaks, keyMap_closest, tables,features,IdColumn,hm)

def __getValuesFromGETable(peaks, keyMap_closest, table, features, IdColumn):
"""

"""
count = 0
v = np.empty((len(peaks), len(features)), dtype=float)
for i, peak in enumerate(peaks):
key = ';'.join(map(str,peak))
Expand All @@ -99,9 +101,12 @@ def __getValuesFromGETable(peaks, keyMap_closest, table, features, IdColumn):
x = float(table[table[IdColumn] == value][feature])
if np.isnan(x):
x = np.nan
else:
count = count + 1
v[i,j] = x
else:
v[i] = [ np.nan ]*len(features)

return v


Expand Down Expand Up @@ -129,10 +134,11 @@ def __update_matrix_values(peaks, keyMap_closest, tables, features, IdColumn, hm
have been found.
"""
assert len(keyMap_closest) == len(peaks)
valuesTab = np.empty((len(peaks), len(tables)*len(features)), dtype=float)
valuesTab = np.empty([len(peaks),len(tables)*len(features)], dtype= float)
for i, table in enumerate(tables):
table = parseTable(table)
values = __getValuesFromGETable(peaks, keyMap_closest, table, features, IdColumn)

valuesTab[:,i*len(features):(i*len(features)+len(features))] = values
for feature in features:
hm.matrix.sample_labels = hm.matrix.sample_labels + ["table"+str(i)+"_"+feature]
Expand All @@ -142,11 +148,13 @@ def __update_matrix_values(peaks, keyMap_closest, tables, features, IdColumn, hm
__update_parameters(hm,len(tables)*len(features))




def parseTable(table_file):
"""

"""
return(pd.read_csv(table_file, sep ='\t'))


def parseMatrixRegions(regions):
all_regions = []
for group in regions:
Expand All @@ -155,6 +163,7 @@ def parseMatrixRegions(regions):
all_regions.append([region[0],start, end, region[2], region[3], region[4], region[5]])
return all_regions


def __update_parameters(hm,length):
"""
Updates matrix parameters
Expand All @@ -168,6 +177,7 @@ def __update_parameters(hm,length):
hm.parameters['ref point'].append(None)
hm.parameters['bin size'].append(10)


def update_matrix_values(peaks, tables,features, IdColumn,hm):
"""
The function is used for the one to one mapping between the name of the enriched regions
Expand All @@ -186,6 +196,7 @@ def update_matrix_values(peaks, tables,features, IdColumn,hm):
hm.matrix.sample_boundaries = hm.matrix.sample_boundaries +[x+1+current_last_col for x in range(len(tables)*len(features))]
__update_parameters(hm,len(tables)*len(features))


def __read_tables_columns(tables, features):
for table in tables:
df = parseTable(table)
Expand Down