-
Notifications
You must be signed in to change notification settings - Fork 4
/
25_cleanup_duplicate_donors_by_variant_overlap.py
executable file
·203 lines (166 loc) · 7.08 KB
/
25_cleanup_duplicate_donors_by_variant_overlap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#! /usr/bin/python3
#
# This source code is part of icgc, an ICGC processing pipeline.
#
# Icgc is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Icgc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see<http://www.gnu.org/licenses/>.
#
# Contact: ivana.mihalek@gmail.com
#
from config import Config
from icgc_utils.common_queries import *
from icgc_utils.utils import *
from icgc_utils.processes import *
def find_duplicates(cursor, somatic_table, diagnostics=False):
duplicates = []
tumor_short = somatic_table.split("_")[0]
##################
outf = None
if diagnostics: outf = open("%s.duplicate_donors.tsv"%tumor_short, "w")
if outf: outf.write("\t".join(['tumor', 'donor 1', 'donor 2', 'vars 1', 'vars 2', 'common vars', 'pct 1', 'pct2'])+"\n")
##################
# we are looking for suspiciously large overlaps in the mutation sets
# first find the mutation set associated with each mutation id
qry = "select icgc_donor_id, group_concat(icgc_mutation_id) from %s " % somatic_table
qry += "group by icgc_donor_id"
ret = search_db(cursor, qry, verbose=False)
if not ret: return None
mutations = {}
for icgc_donor_id, mutstr in ret:
mutations[icgc_donor_id] = set(mutstr.split(","))
# sanity - the only information that we have available that might eliminate
# a pair as potential duplicate is the sex of the donor
sex = {}
qry = "select icgc_donor_id, donor_sex from %s_donor " % tumor_short
ret = search_db(cursor, qry, verbose=False)
if ret: sex = dict(ret)
# now consider each pair of ids and the overlap between their mutation sets
donor = list(mutations.keys())
for i in range(len(mutations)):
mi = mutations[donor[i]]
li = len(mi)
sex_i = sex[donor[i]] if donor[i] in sex else None
for j in range(i+1,len(mutations)):
mj = mutations[donor[j]]
lj = len(mj)
sex_j = sex[donor[j]] if donor[j] in sex else None
if sex_i and sex_j and sex_i.lower()!=sex_j.lower(): continue
common_vars = mi.intersection(mj)
lv = len(common_vars)
if lv<5: continue
pcti = float(lv)/li
pctj = float(lv)/lj
if pcti<0.4 and pctj<0.4: continue
#print("\t".join([str(i) for i in [tumor_short, donor[i], donor[j], li, lj, lv]]) )
if outf:
outfields = [tumor_short, donor[i], donor[j], li, lj, lv, "%d"%(100*pcti), "%d"%(100*pctj)]
outf.write("\t".join([str(i) for i in outfields])+"\n" )
duplicates.append([donor[i], donor[j]])
if outf: outf.close()
return duplicates
#########################################
def resolve(cursor, somatic_table, cluster):
tumor_short = somatic_table.split("_")[0]
# by this point is should have resolved duplicate specimens, but let's check again
qry = "select icgc_donor_id, group_concat(distinct(icgc_specimen_id)) from %s " % somatic_table
qry += "where icgc_donor_id in (%s) " % ",".join([quotify(d) for d in cluster])
qry += "group by icgc_donor_id"
ret = search_db(cursor, qry, verbose=False)
specimen_id = dict(ret)
for icgc_donor_id, spid in specimen_id.items():
if "," in spid:
print("{}: multiple specimen ids ({}) for {}".format(somatic_table, spid, icgc_donor_id))
exit()
# specimen types?
qry = "select icgc_donor_id, specimen_type from %s_specimen " % tumor_short
qry += "where icgc_specimen_id in (%s)" % ",".join([quotify(d) for d in specimen_id.values()])
ret = search_db(cursor, qry)
specimen_type = dict(ret)
# total number of variants, depth of sequencing?
qry = "select icgc_donor_id, count(*), ifnull(avg(total_read_count),0) from %s " % somatic_table
qry += "where icgc_donor_id in (%s) group by icgc_donor_id" % ",".join([quotify(d) for d in cluster])
ret = search_db(cursor, qry)
number_of_variants = dict([r[:2] for r in ret])
avg_depth = dict([[r[0],r[2]] for r in ret])
print("\ncluster")
for donor_id in cluster:
print(" %10s %10d %10.2f %s" %(donor_id, number_of_variants[donor_id], avg_depth[donor_id], specimen_type[donor_id]))
print("end cluster")
primary_tumors = set([donor_id for donor_id in cluster if 'primary' in specimen_type[donor_id].lower()[:len('primary')]])
metastatic_tumors = set([donor_id for donor_id in cluster if 'metastatic' in specimen_type[donor_id].lower()[:len('metastatic')]])
normal_specimens = set([donor_id for donor_id in cluster if 'normal' in specimen_type[donor_id].lower()[0:len('normal')]])
other = cluster.difference(primary_tumors).difference(metastatic_tumors).difference(normal_specimens)
preferred_set = other
for tumor_type_set in [primary_tumors, metastatic_tumors]:
if len(tumor_type_set)>0:
preferred_set = tumor_type_set
break
if len(preferred_set)==0: return # not sure how this could happen
keep_id = None
if len(preferred_set)==1:
keep_id = preferred_set.pop()
else:
# this is arbitrary if all depths are 0
keep_id = sorted(preferred_set,key=lambda x: avg_depth[x], reverse=True)[0]
cluster.remove(keep_id)
print("*** keeping", keep_id)
print("### dropping", cluster)
remove_string = ",".join([quotify(d) for d in cluster])
qry = "delete from %s where icgc_donor_id in (%s)" % (somatic_table, remove_string)
ret = search_db(cursor, qry)
if ret:
search_db(cursor, qry, verbose=True)
exit()
print(qry)
return
#########################################
def cleanup(tables, other_args):
db = connect_to_mysql(Config.mysql_conf_file)
cursor = db.cursor()
switch_to_db(cursor,"icgc")
# group concat default length is 1024 bytes which is kinda smallish
qry = "set session group_concat_max_len = 20000000"
search_db(cursor, qry)
for somatic_table in tables:
print("\n==========\n%s"%somatic_table)
duplicates = find_duplicates(cursor, somatic_table, diagnostics=False)
if not duplicates or len(duplicates)==0: continue
clusters_of_duplicates = find_clusters(duplicates)
for cluster in clusters_of_duplicates:
resolve(cursor, somatic_table, cluster)
cursor.close()
db.close()
#########################################
#########################################
def main():
print("disabled ") # comment out to run
exit(1)
db = connect_to_mysql(Config.mysql_conf_file)
cursor = db.cursor()
#########################
# which somatic tables do we have
qry = "select table_name from information_schema.tables "
qry += "where table_schema='icgc' and table_name like '%_simple_somatic'"
tables = [field[0] for field in search_db(cursor,qry)]
table_size = get_table_size(cursor, 'icgc', tables)
cursor.close()
db.close()
tables_sorted = sorted(tables, key=lambda t: table_size[t], reverse=False)
number_of_chunks = 1
#tables_sorted = ['GBM_simple_somatic']
#number_of_chunks = 1
parallelize(number_of_chunks, cleanup, tables_sorted, [], round_robin=True)
return
#########################################
if __name__ == '__main__':
main()