-
Notifications
You must be signed in to change notification settings - Fork 0
/
2_test_randomisation.R
142 lines (132 loc) · 5.33 KB
/
2_test_randomisation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# 2_test_randomisation.R
# graphical test of overall randomisation, using my code for fraud test
# relies on data being in the `tab1` format
# April 2022
library(nimble) # for Bayesian model
# source the fraud functions
source('C:\\Users\\barnetta\\OneDrive - Queensland University of Technology\\fraud\\baseline_tables\\baseline\\99_functions.R')
## arrange summary data to match fraud functions
# continuous
stats1 = data.frame(N1 = tab1$ContTable$`New model of care`[,1],
miss1 = tab1$ContTable$`New model of care`[,2],
m1 = tab1$ContTable$`New model of care`[,4],
sd1 = tab1$ContTable$`New model of care`[,5]) %>%
mutate(n1 = N1 - miss1) %>%
tibble::rownames_to_column(var = "v1")
stats2 = data.frame(N2 = tab1$ContTable$`Usual care`[,1],
miss2 = tab1$ContTable$`Usual care`[,2],
m2 = tab1$ContTable$`Usual care`[,4],
sd2 = tab1$ContTable$`Usual care`[,5]) %>%
mutate(n2 = N2 - miss2) %>%
tibble::rownames_to_column(var = "v1")
continuous = full_join(stats1, stats2, by='v1')
## categorical
# need to loop through variables as they are in a list
variables = names(tab1$CatTable$`New model of care`)
cat1 = cat2 = NULL
for (v in variables){
this_frame1 = tab1$CatTable$`New model of care`[[v]]
this_frame1 = this_frame1[-1,] # remove reference row
to_process1 = mutate(this_frame1, v1 = paste(v, level)) %>%
select(v1, n, miss, freq) %>%
mutate(freq = as.numeric(freq))
cat1 = bind_rows(cat1, to_process1)
#
this_frame2 = tab1$CatTable$`Usual care`[[v]]
this_frame2 = this_frame2[-1,] # remove reference row
to_process2 = mutate(this_frame2, v1 = paste(v, level)) %>%
select(v1, n, miss, freq)%>%
mutate(freq = as.numeric(freq))
cat2 = bind_rows(cat2, to_process2)
}
cat1 = mutate(cat1, N1 = n - miss) %>%
rename('n1' = 'freq') %>%
select(v1, n1, N1)
cat2 = mutate(cat2, N2 = n - miss) %>%
rename('n2' = 'freq') %>%
select(v1, n2, N2)
cat = full_join(cat1, cat2, by='v1')
# now get t-statistics
table_data = list()
table_data$continuous = continuous
table_data$percents = cat
tstats.p = t.stats.percents(indata = table_data$percents)
tstats.c = t.stats.continuous(indata = table_data$continuous)
# 2) get t-statistics for both statistics types
tstats.c = tstats.p = NULL
if(is.null(table_data$continuous) == FALSE){
tstats.c = t.stats.continuous(indata = table_data$continuous)
}
if(is.null(table_data$percents) == FALSE){
tstats.p = t.stats.percents(indata = table_data$percents)
}
#
tstats = bind_rows(tstats.c, tstats.p, .id = 'statistic') %>%
mutate(study = 1) # dummy study number
# 3) make simulated data
n.sims = 100
for (k in 1:n.sims){
tstats.sim = make_sim(table_data)
# get t-statistics for both statistics types
tstats.c = tstats.p = NULL
if(is.null(table_data$continuous) == FALSE){tstats.c = t.stats.continuous(indata = tstats.sim$continuous)}
if(is.null(table_data$percents) == FALSE){tstats.p = t.stats.percents(indata = tstats.sim$percents)}
tstats.sim = bind_rows(tstats.c, tstats.p, .id = 'statistic') %>%
mutate(study = k+1) # dummy study number
tstats = bind_rows(tstats, tstats.sim) # add to overall data
}
# summary(tstats) # quick check
# 4) run Bayesian model
outfile = 'bayes_test_random.RData'
exists = length(dir('results', outfile)) > 0 # to save time, check if results exist already
if(exists == FALSE){
results = run_bayes_test(in_data = tstats)
save(results, file=paste('results/', outfile, sep='')) # to save time
}
if(exists == TRUE){
load(paste('results/', outfile, sep=''))
}
# 5) draw the CDF
## first create a median
# create all CDFs
average = filter(tstats, study > 1) %>% # just simulations
group_by(study) %>%
mutate(cdf = ecdf(t)(t)) %>% # CDF per study
ungroup()
# now calculate median CDF
cdf_median = group_by(average, study) %>%
arrange(study, t) %>%
mutate(r = rank(t, ties.method = 'first')) %>%
group_by(r) %>%
summarise(mid = median(t)) %>%
ungroup() %>%
mutate(e = r/n(),
study = 1) # had to provide study number
# add first point of the CDF
cdf_first = filter(cdf_median, r==1) %>%
mutate(e = 0, r=0)
cdf_median = bind_rows(cdf_first, cdf_median)
# move trial line to last
tstats = mutate(tstats, study = ifelse(study==1, 999, study))
colours = grey(runif(n = n.sims + 2, min=0.5, max=0.9)) # grey colours for simulations
colours[1] = 'dodgerblue' # colour for median
colours[n.sims + 2] = 'indianred1' # colour for trial
sizes = rep(1, n.sims + 2)
sizes[c(1,n.sims + 2)] = 2 # median and trial are larger
# plot
tplot = ggplot(data=tstats, aes(x=t, size=factor(study), colour=factor(study))) +
theme_bw()+
scale_size_manual(values = sizes)+
scale_color_manual(values = colours)+
stat_ecdf()+
geom_step(data=cdf_median, aes(x=mid, y=e))+ # simulation average CDF
xlab('t-statistic')+
ylab('Cumulative density')+
theme(legend.position = 'none',
panel.grid.minor = element_blank())+
geom_segment(aes(x = 1.5, y = 0.5, xend = 0.9, yend = 0.6), colour='black', lwd=1, arrow = arrow(length = unit(0.03, "npc"))) # to highlight flat region
# export image
filename = 'figures/cumulative_randomisation.jpg'
jpeg(filename, width=5, height=4, units='in', res=400, quality=100)
print(tplot)
dev.off()