-
Notifications
You must be signed in to change notification settings - Fork 608
/
Copy pathcoverage_hist.py
146 lines (126 loc) · 5.19 KB
/
coverage_hist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import logging
import re
from collections import defaultdict
from multiqc.modules.base_module import BaseMultiqcModule
from multiqc.modules.qualimap.QM_BamQC import coverage_histogram_helptext, genome_fraction_helptext
from multiqc.plots import linegraph
# Initialise the logger
log = logging.getLogger(__name__)
class DragenCoverageHist(BaseMultiqcModule):
    def add_coverage_hist(self):
        """Parse DRAGEN fine coverage histogram files and add two plots.

        Collects per-phenotype histograms for every matching log file, merges
        tumor/normal phenotypes into distinct sample entries (normal samples
        get a "_normal" suffix), then renders a coverage-distribution line
        graph and a cumulative-coverage line graph.

        Returns the collection of sample names that produced data (an empty
        set when nothing was found).
        """
        hist_by_sample = defaultdict(dict)
        for f in self.find_log_files("dragen/wgs_fine_hist"):
            sample, by_phenotype = parse_wgs_fine_hist(f)
            sample = self.clean_s_name(sample, f)
            if sample in hist_by_sample:
                log.debug(f"Duplicate sample name found! Overwriting: {sample}")
            self.add_data_source(f, section="stats")
            hist_by_sample[sample].update(by_phenotype)

        # Filter to strip out ignored sample names:
        hist_by_sample = self.ignore_samples(hist_by_sample)

        # Flatten phenotypes into separate sample entries; only "normal"
        # gets a distinguishing suffix, anything else keeps the base name.
        data_by_sample = {}
        for sample, by_phenotype in hist_by_sample.items():
            for phenotype, payload in by_phenotype.items():
                key = f"{sample}_normal" if phenotype == "normal" else sample
                data_by_sample[key] = payload

        if not data_by_sample:
            return set()

        # Only plot data, don't want to write this to a file
        # (can do so with --export-plots already)
        # self.write_data_file(data_by_sample, "dragen_cov_hist")

        dist_data = {}
        cum_data = {}
        for sn, (dist, cum, _) in data_by_sample.items():
            dist_data[sn] = dist
            cum_data[sn] = cum
        # Largest "1% tail" depth across samples — used as xmax on both plots.
        depth_1pc = max(d1pc for (_, _, d1pc) in data_by_sample.values())

        self.add_section(
            name="Coverage distribution",
            anchor="dragen-coverage-distribution",
            description="Number of locations in the reference genome with a given depth of coverage.",
            helptext=coverage_histogram_helptext,
            plot=linegraph.plot(
                dist_data,
                {
                    "id": "dragen_coverage_dist",
                    "title": "Dragen: Coverage distribution",
                    "xlab": "Depth (x)",
                    "ylab": "Number of bases in genome covered by X reads",
                    "ymin": 0,
                    "xmin": 0,
                    "xmax": depth_1pc,  # trim long flat tail
                    "tt_label": "<b>{point.x}X</b>: {point.y} loci",
                    "cpswitch": True,
                },
            ),
        )

        self.add_section(
            name="Cumulative coverage hist",
            anchor="dragen-cum-coverage-histogram",
            description="Number of locations in the reference genome with at least given depth of coverage.",
            helptext=genome_fraction_helptext,
            plot=linegraph.plot(
                cum_data,
                {
                    "id": "dragen_cumulative_coverage_hist",
                    "title": "Dragen: Cumulative coverage hist",
                    "xlab": "Depth (x)",
                    "ylab": "% of bases in genome covered by at least X reads",
                    "ymin": 0,
                    "ymax": 100,
                    "xmin": 0,
                    "xmax": depth_1pc,  # trim long flat tail
                    "tt_label": "<b>{point.x}X</b>: {point.y:.2f}%",
                },
            ),
        )

        return data_by_sample.keys()
def parse_wgs_fine_hist(f):
    """
    T_SRR7890936_50pc.wgs_fine_hist_normal.csv
    T_SRR7890936_50pc.wgs_fine_hist_tumor.csv

    Depth,Overall
    0,104231614
    1,9430586
    2,5546235
    ...
    998,208
    999,177
    1000+,201801

    Contains two columns: Depth and Overall. The value in the Depth column ranges from 0 to 1000+
    and the Overall column indicates the number of loci covered at the corresponding depth.
    Parsing all values except for 1000+ and plotting a distribution histogram and cumulative histogram.

    Returns ``(sample_name, {phenotype: (depth_counts, cumulative_pct, depth_1pc)})``
    where phenotype is "tumor", "normal" or None (no suffix in the file name), and
    depth_1pc is the smallest depth at which fewer than 1% of bases remain — used
    by the caller to trim the long flat tail of the plots.

    Raises ValueError if the file name does not look like a wgs_fine_hist CSV.
    """
    # First pass: collect counts per depth key and the total number of bases
    # (the total is needed to turn cumulative counts into percentages).
    parsed_data = dict()
    for line in f["f"].splitlines():
        if line.startswith("Depth,Overall"):
            continue
        parts = line.split(",")
        if len(parts) != 2:
            continue  # skip blank or malformed lines rather than crashing
        key, cnt = parts
        try:
            cnt = int(cnt)
        except ValueError:
            continue
        parsed_data[key] = cnt
    total_cnt = sum(parsed_data.values())

    data = dict()
    cum_data = dict()
    cum_cnt = 0
    depth_1pc = None
    # Iterate from deepest to shallowest so cum_cnt at each depth means
    # "number of loci covered at >= this depth".
    for key, cnt in reversed(list(parsed_data.items())):
        try:
            depth = int(key)
        except ValueError:
            continue  # skip the open-ended "1000+" bucket (still counted in total_cnt)
        cum_cnt += cnt
        if total_cnt > 0:
            cum_pct = cum_cnt / total_cnt * 100.0
        else:
            cum_pct = 0
        if cum_pct < 1:  # to trim long flat tail
            depth_1pc = depth
        data[depth] = cnt
        cum_data[depth] = cum_pct
    if depth_1pc is None:
        # No depth had <1% cumulative coverage (or no numeric rows at all).
        # Fall back to the maximum parsed depth so the caller's max() over
        # samples never compares against None (TypeError in Python 3).
        depth_1pc = max(data, default=0)

    # Dots escaped so "." matches a literal dot, not any character.
    m = re.search(r"(.*)\.wgs_fine_hist_?(tumor|normal)?\.csv", f["fn"])
    if m is None:
        raise ValueError(f"Cannot parse sample name from file name: {f['fn']}")
    sample, phenotype = m.group(1), m.group(2)
    return sample, {phenotype: (data, cum_data, depth_1pc)}