-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathalgorithms.py
228 lines (185 loc) · 8.79 KB
/
algorithms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import os
import math
import pickle
import subprocess
from collections import deque
from datetime import datetime
import networkx as nx
import numpy as np
import dgl
import torch
def assert_strand(graph, walk):
    """Report every node of ``walk`` whose strand differs from the first node's.

    Nothing is raised or returned; each mismatch is printed as a dashed
    separator line followed by the node's position in ``walk`` and its id.

    Args:
        graph: Object exposing ``ndata['read_strand']``, indexable by node id,
            whose elements provide ``.item()``.
        walk: Sequence of node ids. An empty walk is accepted and reports
            nothing.
    """
    if len(walk) == 0:
        return

    strands = graph.ndata['read_strand']
    org_strand = strands[walk[0]].item()
    # Count from 1 so the printed index is the node's position in ``walk``
    # itself rather than its position in the ``walk[1:]`` slice.
    for idx, node in enumerate(walk[1:], start=1):
        curr_strand = strands[node].item()
        if curr_strand != org_strand:
            print('-' * 20)
            print(f'walk index: {idx}')
            print(f'node index: {node}')
def assert_chromosome(graph, walk):
    """Report every node of ``walk`` whose chromosome differs from the first node's.

    Nothing is raised or returned; each mismatch is printed as a dashed
    separator line followed by the node's position in ``walk`` and its id.

    Args:
        graph: Object exposing ``ndata['read_chr']``, indexable by node id,
            whose elements provide ``.item()``.
        walk: Sequence of node ids. An empty walk is accepted and reports
            nothing.
    """
    if len(walk) == 0:
        return

    chromosomes = graph.ndata['read_chr']
    org_chr = chromosomes[walk[0]].item()
    # Count from 1 so the printed index is the node's position in ``walk``
    # itself rather than its position in the ``walk[1:]`` slice.
    for idx, node in enumerate(walk[1:], start=1):
        curr_chr = chromosomes[node].item()
        if curr_chr != org_chr:
            print('-' * 20)
            print(f'walk index: {idx}')
            print(f'node index: {node}')
def assert_overlap(graph, walk):
    """Print each consecutive pair of ``walk`` whose reads leave a gap.

    Only pairs where both nodes have the same strand, equal to 1 or -1, are
    examined:

    * strand 1: a gap is reported when the successor's start lies past the
      predecessor's end;
    * strand -1: a gap is reported when the successor's end lies before the
      predecessor's start.

    Args:
        graph: Object exposing ``ndata`` with ``'read_start'``, ``'read_end'``
            and ``'read_strand'`` entries, indexable by node id, whose
            elements provide ``.item()``.
        walk: Sequence of node ids.
    """
    starts = graph.ndata['read_start']
    ends = graph.ndata['read_end']
    strands = graph.ndata['read_strand']

    for step, (left, right) in enumerate(zip(walk, walk[1:])):
        left_strand = strands[left].item()
        right_strand = strands[right].item()
        if left_strand != right_strand:
            continue

        if left_strand == 1:
            boundary, reached = ends[left].item(), starts[right].item()
            gap = reached > boundary
        elif left_strand == -1:
            boundary, reached = starts[left].item(), ends[right].item()
            gap = reached < boundary
        else:
            # Any other strand value is never checked.
            continue

        if gap:
            print('-' * 20)
            print(f'walk index: {step}')
            print(f'nodes not connected: {left}, {right}')
            print(f'end: {boundary}, start: {reached}')
def interval_union(name, root):
    """Merge the ``[start, end]`` intervals of all strand-1 reads of a saved graph.

    The graph is the first one stored in ``{root}/processed/{name}.dgl``.
    Reads whose ``read_strand`` is not 1 are ignored.

    Args:
        name: Base name of the ``.dgl`` file (without extension).
        root: Directory containing the ``processed`` sub-directory.

    Returns:
        A list of ``[start, end]`` lists sorted by start, in which intervals
        that overlap or touch have been merged. The list is empty when the
        graph has no strand-1 reads.
    """
    graph = dgl.load_graphs(f'{root}/processed/{name}.dgl')[0][0]
    node_data = graph.ndata
    intervals = [
        [start.item(), end.item()]
        for strand, start, end in zip(node_data['read_strand'], node_data['read_start'], node_data['read_end'])
        if strand.item() == 1
    ]
    # Without this guard ``intervals[0]`` below raises IndexError for a graph
    # that has no strand-1 reads.
    if not intervals:
        return []

    intervals.sort(key=lambda x: x[0])
    result = [intervals[0]]
    for interval in intervals[1:]:
        if interval[0] <= result[-1][1]:
            # Overlaps (or touches) the last merged interval: extend it.
            result[-1][1] = max(result[-1][1], interval[1])
        else:
            result.append(interval)
    return result
def get_gt_for_single_strand(graph, read_start_dict, read_end_dict, positive=False):
    """Collect ground-truth edges for one strand by peeling components off ``graph``.

    Each round works on the nodes not yet consumed:

    1. Pick the start node: smallest read start (``positive``) or largest
       read end (otherwise).
    2. Gather every node reachable from it (DFS) as the *full component*.
    3. Pick the node of the full component that reaches furthest: largest
       read end (``positive``) or smallest read start (otherwise).
    4. Keep only the nodes of the full component from which that furthest
       node can be reached (DFS on the reversed subgraph).
    5. Add the edges of the subgraph induced by the kept nodes to the result,
       unless fewer than two nodes were kept or the furthest node does not
       extend at least as far as the furthest node accepted so far.

    The loop ends when no nodes remain or when the furthest node accepted so
    far is the globally furthest node of ``graph``.

    Args:
        graph: Directed graph offering ``nodes()``, ``subgraph()`` and, on
            subgraphs, ``reverse()`` / ``edges()`` (used here with networkx).
        read_start_dict: Mapping node -> read start.
        read_end_dict: Mapping node -> read end.
        positive: Selects the direction of traversal as described above.

    Returns:
        A set of edges as yielded by ``edges()`` of the induced subgraphs.
        An empty set is returned for a graph without nodes.
    """
    all_nodes = graph.nodes()
    gt_edges = set()
    # min()/max() below raise ValueError on an empty collection, which happens
    # when the strand graph was built from an empty edge list.
    if len(all_nodes) == 0:
        return gt_edges

    if positive:
        final_node = max(all_nodes, key=lambda x: read_end_dict[x])
        highest_node_reached = min(all_nodes, key=lambda x: read_end_dict[x])
    else:
        final_node = min(all_nodes, key=lambda x: read_start_dict[x])
        highest_node_reached = max(all_nodes, key=lambda x: read_start_dict[x])

    while all_nodes:
        if positive:
            start_node = min(all_nodes, key=lambda x: read_start_dict[x])
        else:
            start_node = max(all_nodes, key=lambda x: read_end_dict[x])

        # Everything reachable from the start node among the remaining nodes.
        current_graph = graph.subgraph(all_nodes)
        full_component = set(nx.dfs_postorder_nodes(current_graph, source=start_node))
        full_component.add(start_node)
        if positive:
            highest_node_in_component = max(full_component, key=lambda x: read_end_dict[x])
        else:
            highest_node_in_component = min(full_component, key=lambda x: read_start_dict[x])

        # Restrict to the nodes that can reach the furthest node: a DFS from
        # that node on the reversed subgraph visits exactly those.
        current_graph = graph.subgraph(full_component)
        component = set(nx.dfs_postorder_nodes(current_graph.reverse(copy=True), source=highest_node_in_component))
        component.add(highest_node_in_component)
        current_graph = graph.subgraph(component)

        # If the path does not go further than an already accepted chunk,
        # do not add any of its edges to the ground truth.
        if positive:
            not_reached_highest = read_end_dict[highest_node_in_component] < read_end_dict[highest_node_reached]
        else:
            not_reached_highest = read_start_dict[highest_node_in_component] > read_start_dict[highest_node_reached]

        if len(component) < 2 or not_reached_highest:  # Used to be len(component) <= 2
            all_nodes = all_nodes - full_component
            continue

        highest_node_reached = highest_node_in_component
        gt_edges = set(current_graph.edges()) | gt_edges

        if highest_node_reached == final_node:
            break
        all_nodes = all_nodes - full_component

    return gt_edges
def create_correct_graphs(graph, read_start_dict, read_end_dict, read_strand_dict, read_chr_dict):
    """Split the truly overlapping edges of ``graph`` into per-strand graphs.

    An edge ``(src, dst)`` is kept for the positive graph when ``dst`` starts
    strictly between the start and the end of ``src``, both reads have
    strand 1 and both have the same chromosome. It is kept for the negative
    graph when ``src`` starts strictly between the start and the end of
    ``dst``, both reads have strand -1 and both have the same chromosome.

    Args:
        graph: Graph whose ``edges()`` yields ``(src, dst)`` pairs.
        read_start_dict: Mapping node -> read start.
        read_end_dict: Mapping node -> read end.
        read_strand_dict: Mapping node -> strand.
        read_chr_dict: Mapping node -> chromosome.

    Returns:
        ``(pos_graph, neg_graph)``, two ``nx.DiGraph`` objects built only from
        the kept edges.
    """
    def build_digraph(edge_list):
        digraph = nx.DiGraph()
        digraph.add_edges_from(edge_list)
        return digraph

    forward_edges = []
    reverse_edges = []
    for edge in graph.edges():
        tail, head = edge

        head_starts_in_tail = read_start_dict[head] < read_end_dict[tail] \
            and read_start_dict[head] > read_start_dict[tail]
        if head_starts_in_tail and read_strand_dict[tail] == 1 and read_strand_dict[head] == 1 \
                and read_chr_dict[tail] == read_chr_dict[head]:
            forward_edges.append(edge)

        tail_starts_in_head = read_start_dict[tail] < read_end_dict[head] \
            and read_start_dict[tail] > read_start_dict[head]
        if tail_starts_in_head and read_strand_dict[tail] == -1 and read_strand_dict[head] == -1 \
                and read_chr_dict[tail] == read_chr_dict[head]:
            reverse_edges.append(edge)

    return build_digraph(forward_edges), build_digraph(reverse_edges)
def create_correct_graphs_combo(graph, read_start_dict, read_end_dict, read_strand_dict, read_chr_dict):
    """Build one positive and one negative true-overlap graph per chromosome.

    An edge ``(src, dst)`` is filed under the chromosome of ``src``. It goes
    to the positive list when ``dst`` starts strictly between the start and
    the end of ``src``, both reads have strand 1 and both have the same
    chromosome. It goes to the negative list when ``src`` starts strictly
    between the start and the end of ``dst``, both reads have strand -1 and
    both have the same chromosome.

    Args:
        graph: Graph whose ``edges()`` yields ``(src, dst)`` pairs.
        read_start_dict: Mapping node -> read start.
        read_end_dict: Mapping node -> read end.
        read_strand_dict: Mapping node -> strand.
        read_chr_dict: Mapping node -> chromosome; values must provide
            ``.item()``.

    Returns:
        ``(pos_graphs, neg_graphs)``, two dicts keyed by chromosome value,
        each holding an ``nx.DiGraph`` (possibly empty) for every chromosome
        found in ``read_chr_dict``.
    """
    chromosomes = set(label.item() for label in read_chr_dict.values())
    forward_edges = {chrom: [] for chrom in chromosomes}
    reverse_edges = {chrom: [] for chrom in chromosomes}

    for edge in graph.edges():
        tail, head = edge

        head_starts_in_tail = read_start_dict[head] < read_end_dict[tail] \
            and read_start_dict[head] > read_start_dict[tail]
        if head_starts_in_tail and read_strand_dict[tail] == 1 and read_strand_dict[head] == 1 \
                and read_chr_dict[tail] == read_chr_dict[head]:
            forward_edges[read_chr_dict[tail].item()].append(edge)

        tail_starts_in_head = read_start_dict[tail] < read_end_dict[head] \
            and read_start_dict[tail] > read_start_dict[head]
        if tail_starts_in_head and read_strand_dict[tail] == -1 and read_strand_dict[head] == -1 \
                and read_chr_dict[tail] == read_chr_dict[head]:
            reverse_edges[read_chr_dict[tail].item()].append(edge)

    pos_graphs = {}
    neg_graphs = {}
    for chrom in chromosomes:
        forward_graph = nx.DiGraph()
        forward_graph.add_edges_from(forward_edges[chrom])
        pos_graphs[chrom] = forward_graph

        reverse_graph = nx.DiGraph()
        reverse_graph.add_edges_from(reverse_edges[chrom])
        neg_graphs[chrom] = reverse_graph

    return pos_graphs, neg_graphs
def process_graph(graph):
    """Label every edge of ``graph`` as ground truth (1.0) or not (0.0).

    Node attributes ``read_start``, ``read_end``, ``read_strand`` and
    ``read_chr`` are read from ``graph``; the true-overlap graphs for both
    strands are built with ``create_correct_graphs`` and the ground-truth
    edges of each are found with ``get_gt_for_single_strand``.

    Args:
        graph: networkx graph carrying the four node attributes above.

    Returns:
        ``(gt_edges, gt_dict)``: the set of ground-truth edges of both
        strands, and a dict mapping each edge of ``graph`` to ``1.`` if it is
        in that set and ``0.`` otherwise.
    """
    starts = nx.get_node_attributes(graph, 'read_start')
    ends = nx.get_node_attributes(graph, 'read_end')
    strands = nx.get_node_attributes(graph, 'read_strand')
    chromosomes = nx.get_node_attributes(graph, 'read_chr')

    forward_graph, reverse_graph = create_correct_graphs(graph, starts, ends, strands, chromosomes)
    forward_gt = get_gt_for_single_strand(forward_graph, starts, ends, positive=True)
    reverse_gt = get_gt_for_single_strand(reverse_graph, starts, ends, positive=False)
    gt_edges = reverse_gt | forward_gt

    gt_dict = {edge: (1. if edge in gt_edges else 0.) for edge in graph.edges()}
    return gt_edges, gt_dict
def process_graph_combo(graph):
    """Label every edge of ``graph`` as ground truth, handling chromosomes separately.

    Node attributes ``read_start``, ``read_end``, ``read_strand`` and
    ``read_chr`` are read from ``graph``; ``create_correct_graphs_combo``
    yields one positive and one negative graph per chromosome, and
    ``get_gt_for_single_strand`` is run on each of them. Progress is printed
    to stdout.

    Args:
        graph: networkx graph carrying the four node attributes above.

    Returns:
        ``(gt_edges, gt_dict)``: the set of ground-truth edges over all
        chromosomes and strands, and a dict mapping each edge of ``graph`` to
        ``1.`` if it is in that set and ``0.`` otherwise.
    """
    starts = nx.get_node_attributes(graph, 'read_start')
    ends = nx.get_node_attributes(graph, 'read_end')
    strands = nx.get_node_attributes(graph, 'read_strand')
    chromosomes = nx.get_node_attributes(graph, 'read_chr')

    print(f'Finding correct graphs per chromosome and strand...')
    forward_graphs, reverse_graphs = create_correct_graphs_combo(graph, starts, ends, strands, chromosomes)
    print(f'Chromosomes found: {len(forward_graphs)}')

    gt_edges = set()
    for chrom, forward_graph in forward_graphs.items():
        print(f'Processing chr{chrom}...')
        gt_edges |= get_gt_for_single_strand(forward_graph, starts, ends, positive=True)
    for reverse_graph in reverse_graphs.values():
        gt_edges |= get_gt_for_single_strand(reverse_graph, starts, ends, positive=False)

    gt_dict = {edge: (1. if edge in gt_edges else 0.) for edge in graph.edges()}
    return gt_edges, gt_dict