-
Notifications
You must be signed in to change notification settings - Fork 5
/
collapse.Rmd
161 lines (135 loc) · 4.82 KB
/
collapse.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
---
title: "Collapsing bindingDB compound-gene relationships"
output:
html_document:
theme: cosmo
highlight: pygments
---
```{r, message=FALSE}
library(dplyr)
library(ggplot2)
library(DT)
library(scales)
library(readr)
options(stringsAsFactors=FALSE)
```
```{r}
# Read bindingdb and remove non-human interactions
binding.db <- file.path('data', 'binding.tsv.gz') %>%
readr::read_tsv() %>%
dplyr::filter(organism == 'Homo sapiens') %>%
dplyr::filter(! is.na(affinity_nM)) %>%
dplyr::mutate(
source=plyr::mapvalues(source, c('Curated from the literature by BindingDB'), c('BindingDB'))
)
# View a subset of the data.frame
binding.db %>% dplyr::sample_n(200) %>% dplyr::select(-c(pubmed, doi)) %>% DT::datatable()
```
```{r}
# Read the drugbank to bindingDB fuzzy mappings produced using UniChem
# Restrict to compounds in drugbank
joined.df <- 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/bindingdb.tsv' %>%
readr::read_tsv() %>%
dplyr::inner_join(binding.db)
```
`r nrow(joined.df)` compound--protein binding measurements are extracted for humans when restricting to DrugBank-mapped compounds.
```{r}
geom.mean <- function(x) {
# Returns the geometric mean
exp(mean(log(x)))
}
ResolveAffinity <- function(df) {
# Preferentially selects the affinity measure. If multiple meansurements
# exist for the same compound-protein pair, the geometric mean is taken.
for (measure in c('Kd', 'Ki', 'IC50')) {
if (is.element(measure, df$measure)) {
measure.df <- df[df$measure == measure, ]
return.df <- data.frame(
measure = measure,
affinity_nM = round(geom.mean(measure.df$affinity_nM), 5),
n_measures = nrow(measure.df),
sources = paste(unique(na.omit(measure.df$source)), collapse=','),
pubmeds = paste(unique(na.omit(measure.df$pubmed)), collapse=','))
return(return.df)
}
}
}
# Create a single affinity measure for each compound-protein pair
collapse.df <- joined.df %>%
dplyr::group_by(drugbank_id, bindingdb_id, uniprot, entrez_gene) %>%
dplyr::do(ResolveAffinity(.)) %>%
dplyr::ungroup()
collapse.df %>%
readr::write_tsv('data/bindings-drugbank-collapsed.tsv')
# View a subset of the data.frame
collapse.df %>% dplyr::sample_n(200) %>% DT::datatable()
```
`r nrow(collapse.df)` compound--protein pairs were assayed.
```{r}
drugbank.df <- 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/drugbank.tsv' %>%
readr::read_tsv() %>%
dplyr::mutate(drugbank_approved = as.integer(grepl('approved', groups))) %>%
dplyr::transmute(drugbank_id, drugbank_name = name, drugbank_approved)
entrez.df <- 'https://raw.githubusercontent.com/dhimmel/entrez-gene/5352b31e04ec136e99d25a0ba63e8867aa71b69f/data/genes-human.tsv' %>%
readr::read_tsv() %>%
dplyr::transmute(entrez_gene = GeneID, gene_symbol = Symbol)
gene.df <- collapse.df %>%
dplyr::group_by(drugbank_id, entrez_gene) %>%
dplyr::summarize(
affinity_nM = min(affinity_nM),
n_pairs = n(),
sources = paste(unique(sources), collapse=','),
pubmeds = paste(unique(pubmeds), collapse=',')
) %>%
dplyr::ungroup() %>%
dplyr::left_join(drugbank.df) %>%
dplyr::left_join(entrez.df)
gene.df %>%
readr::write_tsv('data/bindings-drugbank-gene.tsv')
# View bindings for approved drugs
gene.df %>%
dplyr::filter(affinity_nM <= 1000) %>%
dplyr::filter(drugbank_approved == 1) %>%
dplyr::select(drugbank_name, gene_symbol, affinity_nM, n_pairs) %>%
DT::datatable()
```
`r nrow(gene.df)` drugbank--gene pairs have measured binding affinities.
### Interaction retention based on affinity threshold
```{r, fig.width=8}
exp.range <- -5:11
gene.df %>%
ggplot(aes(x = affinity_nM)) +
geom_histogram(alpha = 0.6) +
scale_x_log10(
breaks = scales::trans_breaks("log10", n=10, function(x) 10^x),
labels = scales::trans_format("log10", math_format(10^.x))) +
theme_bw()
gene.df %>%
ggplot(aes(x = affinity_nM)) +
stat_ecdf() +
scale_x_log10(
breaks = scales::trans_breaks("log10", n=10, function(x) 10^x),
labels = scales::trans_format("log10", math_format(10^.x))) +
theme_bw()
```
### Interactions per compound and per gene when restricting to micromolar or stronger affinities.
```{r, fig.width=8}
gene.df %>%
dplyr::filter(affinity_nM <= 1000) %>%
dplyr::group_by(drugbank_id) %>%
dplyr::summarize(n_genes = n()) %>%
ggplot(aes(x=n_genes)) +
geom_histogram(alpha=0.6) +
scale_x_log10(breaks=c(1:3, 5, 10, 20, 50, 100)) +
xlab('Genes bound per compound') +
theme_bw()
gene.df %>%
dplyr::filter(affinity_nM <= 1000) %>%
dplyr::group_by(entrez_gene) %>%
dplyr::summarize(n_compounds = n()) %>%
ggplot(aes(x=n_compounds)) +
geom_histogram(alpha=0.6) +
scale_x_log10(breaks=c(1:5, 7, 10, 15, 25, 50)) +
xlab('Compounds binding per gene') +
theme_bw()
```