# Plot and analysis functions.R
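# Note on dependencies: this script assumes the following packages are attached elsewhere in
# the project (they are used below but not all loaded here): dplyr, tidyr, reshape2, stringr,
# jsonlite, ggplot2, RColorBrewer, gridExtra and grid, plus RCurl and XML (loaded inside
# pubmed_fetch). It also assumes the objects entrez_email, pubmed_sqlite, journals_sqlite and
# unique.countries are defined before these functions are called.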
# This function is used to call Pubmed from R. In this project, I just use it to look at the Abstracts as a quality control step.
# If clean.up is FALSE, it just returns the entire PubMed record in a big, confusing list of lists. Useful for bug-checks etc
# If TRUE, it extracts only the data that we want and returns it as a nice data frame
# If save.abstracts=TRUE, we take the entire abstract text (or the first-listed chunk of text, for abstracts split over multiple fields)
# Else, we just count the number of characters in the abstract
pubmed_fetch <- function(ids, clean.up = T, save.abstracts = F, save.titles = F){
require(RCurl)
require(XML)
# Internal function that gets all the data that we want out of a pubmed record
retrieve.data <- function(record, save.abstracts, save.titles){
forenames <- get.first.names(record) # Get the first names using the get.first.names function
record <- unlist(record) # Now unlist the record
field <- names(record) # Store the names of each Pubmed field
record <- as.character(record) # Ensure the record is a character not a factor (needed for string matching)
# Prepare the output data frame. It will contain NAs unless we can find the data to fill each field in the PubMed record
output <- data.frame(id=NA, doi=NA, date=NA, journal = NA, type = NA, abstract = NA)
id <- record[grep("PMID.text", field)] # Try to get the PubMed ID
if(length(id) >= 1) output$id <- id[1] # If there is one, add it to the output
# Get the date, and store it in this format: 15_01_2012
date <- paste0(c(record[field == "MedlineCitation.DateCreated.Day"], record[field == "MedlineCitation.DateCreated.Month"], record[field == "MedlineCitation.DateCreated.Year"]), collapse = "_")
if(length(date) >= 1) output$date <- date[1]
journal <- record[field == "MedlineCitation.Article.Journal.ISOAbbreviation"] # Get the journal
if(length(journal) == 1) output$journal <- journal
# Get the publication type list
type <- record[grep("PublicationTypeList.PublicationType", field)]
if(length(type) == 1) output$type <- type
else if(length(type) > 1) output$type <- paste0(type, collapse = "_") # If there are multiples, collapse to a single character string with underscores
abstract <- record[grep("Abstract.AbstractText", field)] # Get the entire abstract(s)
if(length(abstract) >= 1 & !(save.abstracts)) output$abstract <- max(nchar(abstract, keepNA = T)) # Save the length of the abstract (or the longest abstract chunk, for papers with multi-part abstracts split over multiple fields). Discarding the abstracts makes the files much smaller (MB rather than GB)
else if (length(abstract) >= 1 & save.abstracts) output$abstract <- paste0(abstract, collapse = "___")
if(save.titles) output$title <- record[grep("Article.ArticleTitle", field)][1] # Take the first match, in case a record has more than one title field
# Searching for the DOI is slightly hard, because annoyingly there is no obvious, consistent 'doi' field in PubMed!
# Here, a DOI is defined as a string that contains "10." followed by at least one digit, then a slash, then some letters and/or numbers.
to.search.for.doi <- record[c(grep("ArticleIdList", field), grep("ELocationID", field))] # Search only the "ArticleIdList" and "ELocationID" fields
doi <- unique(to.search.for.doi[grep("10[.][[:digit:]]+/[[:alnum:]]", to.search.for.doi)])
if(length(doi) >= 1) output$doi <- doi[1] # If there are multiple possible DOIs, just take the first candidate DOI in the "ArticleIdList" and "ELocationID" fields
return(data.frame(output[,!(names(output) %in% "abstract")], forenames, abstract = output$abstract, stringsAsFactors = F))
}
args <- c(id = paste(ids, collapse = ","), db = "pubmed", rettype = "xml", email = entrez_email, tool = "rpubmed")
url_args <- paste(paste(names(args), args, sep="="), collapse = "&")
base_url <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=full"
url_string <- paste(base_url, url_args, sep = "&")
records <- getURL(url_string)
Sys.sleep(0.33) #NCBI limits requests to three per second
records <- xmlToList(xmlTreeParse(records, useInternalNodes = TRUE))
if(!clean.up) return(records)
output <- do.call("rbind", lapply(records, retrieve.data, save.abstracts=save.abstracts, save.titles=save.titles))
rm(records) # Try to minimise pressure on memory by deleting this as early as we can
rownames(output) <- NULL
return(output)
}
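# Example usage (a sketch, left commented out so it does not run when sourcing this file).
# The PMIDs below are hypothetical placeholders, and entrez_email must already hold your own
# address, as NCBI requests:
# pubmed_fetch(c(23193287, 21876726), clean.up = TRUE, save.abstracts = FALSE)
# This returns a one-row-per-record data frame with the id, doi, date, journal, type, the
# author forenames, and the abstract length in characters.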
# If type = "since2000", we represent the date at which the paper appeared on PubMed as the number of days since 1st January 2000, divided by 365.25
# Else if type = "before2016", it's the number of days before 20/08/2016, divided by 365.25 - this is the date on which I downloaded a local copy of the Medline database.
convert.dates <- function(dates, type = "before2016") {
if(type == "since2000") dates <- as.numeric(as.Date(dates, "%d_%m_%Y")) - as.numeric(as.Date("01_01_2000","%d_%m_%Y"))
else if(type == "before2016") dates <- as.numeric(as.Date(dates, "%d_%m_%Y")) - as.numeric(as.Date("20_08_2016","%d_%m_%Y"))
return(dates / 365.25) # Convert from days to years
}
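# A quick worked example: "15_01_2012" is 4397 days after 1st January 2000, so
# convert.dates("15_01_2012", type = "since2000") returns 4397 / 365.25, i.e. about 12.04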
# Function to convert a log odds ratio to a probability
# toProb <- function(x) exp(as.numeric(x)) / (1 + exp(as.numeric(x)))
# Properly capitalise country names
capitalise.countries <- function(countries){
countries <- as.character(sapply(as.character(countries), function(x) {s <- strsplit(x, " ")[[1]]; paste(toupper(substring(s, 1,1)), substring(s, 2), sep="", collapse=" ")}))
countries <- str_replace(countries, " And ", " and ")
str_replace(countries, "Usa", "USA")
}
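# e.g. capitalise.countries(c("trinidad and tobago", "usa")) returns "Trinidad and Tobago" and "USA"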
# Used to predict the gender ratio (as proportion), given a date t, and a value for the two shape parameters r and c
pfunc <- function(t, r, c){exp(0.5*r*t)/(2*exp(0.5*r*t)+c)} # function for a sigmoid curve that converges to 0 or 0.5 at +ve infinity, and 0.5 or 0 at -ve infinity
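# Sanity check on pfunc: at t = 0 and c = 1 it returns 1/3 regardless of r; with r > 0 and c > 0
# the curve rises towards 0.5 as t grows, and with r < 0 it falls towards 0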
# Get the estimated gender ratio, its rate of change, and the estimated number of years until gender parity (defined as 45-55% female)
gender.stats <- function(item, filter.type, position, country = "all", data.source, chunk.size = 10, nChunks = 100, plot = T, print.each.one = F, run.sim = F, export.JSON = F, verbose = T){
# Print which item we are doing
if(verbose) print(paste("Doing ", item, " (", position, ")", sep = ""))
############ DEFINE INTERNAL FUNCTIONS
pfunc.deriv <- function(p, r) r*p*(0.5-p) # the first derivative of the pfunc function (needed to find the rate of change in gender ratio on a specified date)
# log likelihood of data given the parameters, assuming binomial distribution
# returns the negative log-likelihood to be minimised with optim()
find.ll<-function(data, par) suppressWarnings(with(data, -1 * sum(dbinom(x = nFemales, size = n, prob = pfunc(t = date, r = par[1], c = par[2]), log = TRUE))))
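# e.g. a date with 10 gendered authors, 4 of them female, and a predicted proportion of 0.4
# contributes dbinom(4, 10, 0.4, log = TRUE), about -1.38, to the (positive) log likelihood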
# Internal function to perform two-parameter optimisation, finding the values of r and c that maximise the log likelihood for the focal set of data. Arbitrarily chosen starting values
run.optimiser <- function(data) optim(par = c(0.1, 1), find.ll, data = data)
# Used to find the 95% CIs on a proportion (uses default method of the binom.test function). Works for a pair of columns nFemales and nMales
get.CIs <- function(nFemales.column, nMales.column){
CIs <- data.frame(lowerCI = numeric(length(nFemales.column)), upperCI = NA)
for(i in 1:length(nFemales.column)){
CIs.foc <- as.numeric(binom.test(nFemales.column[i], nFemales.column[i] + nMales.column[i])$conf.int)
CIs$lowerCI[i] <- CIs.foc[1]
CIs$upperCI[i] <- CIs.foc[2]
}
return(100 * CIs)
}
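# e.g. get.CIs(40, 60) gives the Clopper-Pearson interval for 40 females out of 100 authors,
# roughly 30 and 50 (the values are multiplied by 100 to express them as percentages)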
# Function to export the results to disk in JSON format
run.JSON.export <- function(data.for.web.app=NULL){ # , return.file.name.only = F
if(position == "first") position <- "First"
if(position == "last") position <- "Last"
if(position == "single") position <- "Single"
if(position == "overall") position <- "Overall"
country <- capitalise.countries(country)
if(filter.type == "disc" & country == "All"){
data.for.web.app <- list(Discipline = item, Journal = "allJournals", Country = "allCountries", Position = position, Points = data.for.web.app[[1]], Curve = data.for.web.app[[2]])
file.name <- paste("allJournals", item, position, "allCountries", sep = "_")
}
if(filter.type == "disc" & country != "All"){
data.for.web.app <- list(Discipline = item, Journal = "allJournals", Country = country, Position = position, Points = data.for.web.app[[1]], Curve = data.for.web.app[[2]])
file.name <- paste("allJournals", item, position, country, sep = "_")
}
if(filter.type == "journal" & country == "All"){
data.for.web.app <- list(Discipline = discipline, Journal = item, Country = "allCountries", Position = position, Points = data.for.web.app[[1]], Curve = data.for.web.app[[2]])
file.name <- paste(item, discipline, position, "allCountries", sep = "_")
}
if(filter.type == "journal" & country != "All"){
data.for.web.app <- list(Discipline = discipline, Journal = item, Country = country, Position = position, Points = data.for.web.app[[1]], Curve = data.for.web.app[[2]])
file.name <- paste(item, discipline, position, country, sep = "_")
}
if(filter.type == "everything.for.one.country"){
data.for.web.app <- list(Discipline = "allDisciplines", Journal = "allJournals", Country = country, Position = position, Points = data.for.web.app[[1]], Curve = data.for.web.app[[2]])
file.name <- paste("allJournals", "allDisciplines", position, country, sep = "_")
}
if(filter.type == "everything.for.all.countries"){
data.for.web.app <- list(Discipline = "allDisciplines", Journal = "allJournals", Country = "allCountries", Position = position, Points = data.for.web.app[[1]], Curve = data.for.web.app[[2]])
file.name <- paste("allJournals", "allDisciplines", position, "allCountries", sep = "_")
}
# if(return.file.name.only) return(file.name)
file.name <- paste("../outputs/json files for web app/", file.name, ".json", sep = "")
write(toJSON(data.for.web.app, pretty = T), file.name)
}
############ END OF INTERNAL FUNCTIONS
###### GET THE FOCAL SET OF DATA
if(!(filter.type %in% c("arxiv.cat", "arxiv.subcat"))){ # If we're doing PubMed data...
# Get the data out of the data.frame or database connection using dplyr. Also restrict the data to 2002 onwards, since the data are very sporadic before this
if(filter.type == "disc") real.data <- data.source %>% filter(discipline == item) %>% select(date, country, gender.95) %>% rename(gender = gender.95)
if(filter.type == "journal") real.data <- data.source %>% filter(journal == item) %>% select(date, country, gender.95) %>% rename(gender = gender.95)
if(filter.type %in% c("disc", "journal") & country == "all") real.data <- real.data %>% select(-country)
if(filter.type == "everything.for.all.countries") real.data <- data.source # for when we want to know the change in gender ratio for all of PubMed, irrespective of journal/disc/country!
if(filter.type != "everything.for.one.country") real.data <- real.data %>% collect(n = Inf) %>% as.data.frame %>% mutate(date = convert.dates(date, type = "since2000")) %>% filter(date >= 2)
if(filter.type == "everything.for.one.country") {
if(country == "all") return("check the arguments") # safeguard against accidental massive operation
real.data <- data.source %>% filter(country != "NA" & !(country %in% unique.countries[unique.countries != country])) # discard all papers with no country, and those where country == wrong country
real.data <- real.data %>% select(date, country, gender.95) %>% rename(gender = gender.95) %>% collect(n = Inf) %>% as.data.frame %>% mutate(date = convert.dates(date, type = "since2000")) %>% filter(date >= 2)
real.data <- real.data[grep(country, real.data$country),] # throw out papers where the focal country is not listed
}
}
if(filter.type == "arxiv.cat"){ # for ArXiv data, there is more data from the 1990s than on PubMed. Restrict to 1998 onwards instead of 2002 (1998 has 8119 preprints - drops to 5492 in 1997)
real.data <- data.source %>% filter(category == item) %>% select(date, gender.95) %>% mutate(date = convert.dates(date, type = "since2000")) %>% filter(as.numeric(substr(date, 7, 10)) >= 1998) %>% rename(gender = gender.95)
}
if(filter.type == "arxiv.subcat"){
real.data <- data.source %>% filter(paste(category, sub.category, sep = ": ") == item) %>% select(date, gender.95) %>% filter(as.numeric(substr(date, 7, 10)) >= 1998) %>% mutate(date = convert.dates(date, type = "since2000")) %>% rename(gender = gender.95) # as above, filter on the year before converting the dates
category <- strsplit(item, split = ": ")[[1]][1]
sub.category <- strsplit(item, split = ": ")[[1]][2]
}
# Restrict the data to the specified authorship position
if(position == "single") real.data <- real.data[nchar(real.data$gender) == 1, ] # if doing single, first or last, throw out some of the data
else if(position != "overall"){
real.data <- real.data[nchar(real.data$gender) > 1, ] # restrict to multi-author papers for "first" and "last" author analyses
if(position == "first") real.data$gender <- substr(real.data$gender, 1, 1)
if(position == "last") {
length.author.list <- nchar(real.data$gender)
real.data$gender <- substr(real.data$gender, length.author.list, length.author.list)
}
}
# If specified, trim the data down to a single country as well
if(country != "all"){
real.data <- real.data[str_count("U", real.data$gender) != nchar(real.data$gender), ] # Throw out papers where all authors' genders are unknown
real.data <- real.data[!is.na(real.data$country), ] # Throw out papers where the country is recorded as NA
real.data <- real.data[str_detect(real.data$country, country), ] # Throw out papers where the focal country was not listed anywhere
if(nrow(real.data) == 0) {
# print("There are no data for this country, giving up")
return(NULL) # If there are no data, just end the whole function and return nothing
}
split.country <- strsplit(real.data$country, split = "_") # split the string holding the countries
n.countries <- sapply(split.country, length)
to.keep <- n.countries == 1 | n.countries == nchar(real.data$gender) # To be safe, throw out papers with multiple countries where this number is not the same as number of authors
real.data <- real.data[to.keep, ]
split.country <- split.country[to.keep]; rm(to.keep)
n.countries <- sapply(split.country, length)
if(position != "overall") { # for first/last/single
if(position == "last") real.data$country <- sapply(split.country, function(x) x[length(x)]) # get last listed country
else real.data$country <- sapply(split.country, function(x) x[1]) # get first/single country
real.data <- real.data[str_detect(real.data$country, country), ] # throw out papers where the focal country did not belong to the author we wanted (e.g. first or last)
}
else{ # for "overall"...
multi.country.real.data <- split.country[n.countries > 1]
if(length(multi.country.real.data) > 0){
# Trim off all but the authors that are from the correct country
# We will make the assumption that all the authors from a multi-author paper for which one country is listed are from that country
paperID <- unlist(mapply(rep, 1:length(multi.country.real.data), each = sapply(multi.country.real.data, length)))
multi.country.real.data <- unlist(multi.country.real.data)
keep.author.or.not <- multi.country.real.data == country
multi.country.split.gender <- unlist(strsplit(real.data$gender[n.countries > 1], split = ""))
multi.country.split.gender[!keep.author.or.not] <- "U" # Set the genders of authors from the wrong country to "U" (keep.author.or.not is TRUE for the focal country's authors, so we negate it here)
real.data$gender[n.countries > 1] <- as.character(tapply(multi.country.split.gender, paperID, function(x) paste0(x, collapse = "")))
}
}
real.data <- real.data %>% select(-country) # we are done with the country column now
}
# Throw out papers with all unknown authors, and count the number of papers remaining
real.data <- real.data[str_count("U", real.data$gender) != nchar(real.data$gender) & !is.na(real.data$gender), ]
n.papers <- nrow(real.data)
################### If there are no data, just end the function and return nothing
if(n.papers == 0) {
# print("There are no data for this item, giving up")
return(NULL)
}
################### Count the number of male and female authors observed on each unique date - we use this data to generate resamples efficiently
real.data <- real.data %>% group_by(date) %>% summarise(F = sum(str_count(gender, "F")), M = sum(str_count(gender, "M"))) %>% as.data.frame %>% gather(gender, count, -date)
probabilities <- with(real.data, count / sum(count))
n.authors <- sum(real.data$count) # The number of M and F authors in the dataset - each resample will consist of this many authors, drawn with replacement
################### If there are too few data, just end the function and return nothing
if(n.papers < 100 | n.authors < 250 | max(real.data$date) - min(real.data$date) < 5) {
# print("There are too few papers, authors, or years to fit a curve, giving up")
return(NULL) # <- note the criteria for attempting to fit a curve
}
################### If doing a journal, we need to also get the discipline for the focal journal from the journals database
if(filter.type == "journal") discipline <- (journals_sqlite %>% filter(short.title == item) %>% select(Discipline) %>% collect() %>% as.data.frame())[,1][1] # note the [1] ensures we get only 1 disc in cases where there are two journals with the same "short.title" listed in journals_sqlite. For example, PubMed lists "Journal of acquired immune deficiency syndromes" and "Journal of acquired immune deficiency syndromes (1999)" as having the same abbreviation, namely "J Acquir Immune Defic Syndr"
################### NOW RESAMPLE THE DATA TO DETERMINE THE 95% CIs on parameters like current gender ratio, rate of change, and years to parity (takes a while, hence option to skip it)
if(run.sim){ # If run.sim = T, then we will be doing lots of time-consuming resampling and exporting the data to make Figures 1 and 2 in the paper
# Function to make "chunk.size" resampled datasets containing n.authors each - this will be done 'nChunks' times (too big for memory if done in one go)
make.resampled.data <- function(real.data, n.authors, chunk.size){
resampled.data <- sample(1:nrow(real.data), size = chunk.size * n.authors, replace = T, prob = probabilities)
resampled.data <- melt(table(factor(resampled.data, levels = 1:nrow(real.data)), rep(1:chunk.size, each = n.authors))) %>% rename(row = Var1, boot.replicate = Var2, count = value) %>% mutate(row = as.numeric(as.character(row)))
resampled.data <- data.frame(date = real.data$date[resampled.data$row], gender = real.data$gender[resampled.data$row], boot.replicate = resampled.data$boot.replicate, count = resampled.data$count)
resampled.data <- resampled.data %>% group_by(boot.replicate, date) %>% summarise(nFemales = sum(count[gender == "F"]), nMales = sum(count[gender == "M"]), n = nFemales+nMales) %>% as.data.frame %>% filter(n > 0)
resampled.data
}
# This function runs the optimiser on a focal bit of resampled data, and gets the current gender ratio, rate of change,
# and the year at which the ratio got within 5% of gender parity using the pfunc model with optimised parameters
find.response.variables <- function(data){
optim.results <- run.optimiser(data)
r <- optim.results$par[1]
c <- optim.results$par[2]
gender.ratio.at.present <- pfunc(16.63518, r, c) # Compute the predicted gender ratio at the "present", i.e. the day I collected the data, 20/8/16
current.rate.of.change <- pfunc.deriv(p = gender.ratio.at.present, r = r) # Compute the rate of change in the gender ratio at the "present"
year.range <- 0:9999
predicted.gender.ratio <- pfunc(year.range, r, c) # Compute the predicted gender ratio from year 2000 way into the future
rising <- predicted.gender.ratio[2] > predicted.gender.ratio[1] # Check if the predicted ratio is rising or falling towards 50:50
if(rising) {
if(gender.ratio.at.present < 0.5) parity.year <- year.range[which(predicted.gender.ratio > 0.45)][1] # record first year it got within 5% of parity
else parity.year <- "Female-biased and becoming more so"
}
else {
if(gender.ratio.at.present > 0.5) parity.year <- year.range[which(predicted.gender.ratio < 0.55)][1]
else parity.year <- "Male-biased and becoming more so"
}
return(data.frame(gender.ratio.at.present=gender.ratio.at.present, current.rate.of.change=current.rate.of.change, parity.year=parity.year, stringsAsFactors = F))
}
# Now run "find.response.variables" on each set of resampled data, and get the estimated 95% CIs using quantile
resampled.data <- make.resampled.data(real.data, n.authors, chunk.size)
boot.estimates <- do.call("rbind", lapply(1:chunk.size, function(i) find.response.variables(resampled.data %>% filter(boot.replicate == i))))
rm(resampled.data) # help keep memory use low
if(nChunks > 1){
for(ii in 2:nChunks){
resampled.data <- make.resampled.data(real.data, n.authors, chunk.size)
boot.estimates <- rbind(boot.estimates, do.call("rbind", lapply(1:chunk.size, function(i) find.response.variables(resampled.data %>% filter(boot.replicate == i)))))
rm(resampled.data) # help keep memory use low
}
}
CIs.1 <- as.numeric(quantile(boot.estimates$gender.ratio.at.present, probs = c(0.025, 0.975)))
CIs.2 <- as.numeric(quantile(boot.estimates$current.rate.of.change, probs = c(0.025, 0.975)))
if(is.numeric(boot.estimates$parity.year)) {
parity.year <- median(boot.estimates$parity.year + 2000) - 2016.63518
CIs.3 <- as.numeric(quantile(boot.estimates$parity.year + 2000, probs = c(0.025, 0.975))) - 2016.63518
parity.year[parity.year < 0] <- 0
CIs.3[CIs.3 < 0] <- 0
}
else {
Mode <- function(x) {ux <- unique(x); ux[which.max(tabulate(match(x, ux)))]} # if not going to hit parity sometimes, pick the modal (i.e. most likely) non-parity outcome
parity.year <- Mode(boot.estimates$parity.year)
CIs.3 <- c(NA, NA)
}
} ######## end of if(run.sim)
############################# GET R AND C VALUES FROM THE REAL DATA, AS WELL AS THE DATA POINTS NEEDED FOR A GRAPH (here, or in the web app)
# Now find the r and c values using the real dataset (not resamples), and make all the data needed for a plot. This is the main thing needed if export.JSON = T
real.data <- real.data %>% group_by(date) %>% summarise(nFemales = sum(count[gender == "F"]), nMales = sum(count[gender == "M"]), n = nFemales+nMales, proportion.F = nFemales/n) %>% as.data.frame %>% filter(n > 0)
result <- run.optimiser(real.data)
real.r <- result$par[1]
real.c <- result$par[2]
# Find the gender ratio per year from 2002 to 2016 - note that only years with more than 50 authors measured will appear
overlay.points <- real.data %>% mutate(date = floor(date)) %>% group_by(date) %>%
summarise(percent.F = 100 * sum(nFemales) / (sum(nFemales) + sum(nMales)), # % females in focal year
nFemales = sum(nFemales), nMales = sum(nMales), # number of male and female authors in the focal year
n = n()) %>% # 'n' is the number of papers in the focal year
filter(nFemales + nMales > 50) %>% # Only keep years with at least 50 authors measured
mutate(Year = date + 2000) %>% # add a nicer year column, and reorder the columns neatly
select(Year, percent.F, nFemales, nMales, n)
if(nrow(overlay.points) < 5) {
# print("There are too few years, giving up")
return(NULL) # return nothing if we do not have at least 50 gendered authors from at least 5 different years (in addition to having at least 250 authors and 100 papers overall - see above)
}
overlay.points <- cbind(overlay.points, get.CIs(overlay.points$nFemales, overlay.points$nMales))
if(plot){ # Make a nice plot if requested
model.fit <- data.frame(time = seq(0, 50, by = 0.1)) %>% mutate(percent.F = 100 * pfunc(time, real.r, real.c), Year = time + 2000)
print( ggplot() + geom_errorbar(data = overlay.points, aes(x = Year, ymin = lowerCI, ymax = upperCI), colour = "darkgrey", width = 0) +
geom_point(data = overlay.points, aes(x = Year, y = percent.F)) + geom_line(data = model.fit, aes(x = Year, y = percent.F)) + ylab("% women authors") )
}
if(export.JSON){ # The web app re-draws the fitted curve from r and c as: % women = 100 * exp(0.5*r*(t-2000)) / (2*exp(0.5*r*(t-2000)) + c), where t is a date such that 2004.25 means a quarter of the way through 2004
data.for.web.app <- list(
overlay.points %>% select(Year, nFemales, nMales, n, percent.F, lowerCI, upperCI) %>%
rename(Y = Year, F = nFemales, M = nMales, GR = percent.F, lc = lowerCI, uc = upperCI) %>%
mutate(GR = round(GR,1), lc = round(lc,1), uc = round(uc,1)), # note rounding to save hard disk space and download size, important for the web
data.frame(r = signif(real.r,2), c = signif(real.c,2)))
run.JSON.export(data.for.web.app)
}
if(run.sim){
notes <- ""
if(is.numeric(parity.year) & any(is.na(CIs.3))) notes <- "Varying or unmeasurable trends in gender ratio"
out <- data.frame(journal = item, discipline = item, position = position, n.authors = n.authors, n.papers = n.papers,
gender.ratio.at.present = median(boot.estimates[,1]), low.CI.1 = CIs.1[1], up.CI.1 = CIs.1[2],
current.rate.of.change = median(boot.estimates[,2]), low.CI.2 = CIs.2[1], up.CI.2 = CIs.2[2],
years.to.parity = parity.year, low.CI.3 = CIs.3[1], up.CI.3 = CIs.3[2], r=real.r, c=real.c, notes = notes, stringsAsFactors = F)
out[6:11] <- out[6:11] * 100 # express as percentages
if(filter.type == "disc") out$journal <- "All journals"
if(filter.type == "journal") out$discipline <- discipline
if(filter.type == "arxiv.cat") {
out <- out %>% rename(category = discipline, sub.category = journal)
out$sub.category <- "All subcategories"
}
if(filter.type == "arxiv.subcat") {
out <- out %>% rename(category = discipline, sub.category = journal)
out$category <- category # The cat and sub-cat were saved way up above
out$sub.category <- sub.category
}
if(print.each.one) print(out)
return(out)
}
} # End of big function
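# Example usage (a sketch; "Ecology" is a hypothetical discipline name, and pubmed_sqlite is
# the database connection assumed throughout this script):
# gender.stats(item = "Ecology", filter.type = "disc", position = "first",
#              data.source = pubmed_sqlite, run.sim = FALSE, export.JSON = FALSE, plot = TRUE)
# With run.sim = FALSE this just fits the curve and draws the plot; with run.sim = TRUE it also
# bootstraps the 95% CIs and returns a one-row summary data frame.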
# wrapper for the gender.stats function used to make the web app data - feeds in a dataframe of arguments, and does one row from it. For use with lapply
make.web.app.data <- function(row, arguments){
if(row %% 10 == 0) print(row)
output <- "not done yet"
attempts.left <- 5 # Very occasionally fails when the curve is almost totally flat, due to negative numbers in the likelihood or something. Just re-run when this happens
while(attempts.left > 0) {
try(output <- with(arguments, gender.stats(item = item[row], filter.type = filter.type[row], position = position[row], country = country[row],
data.source = pubmed_sqlite, run.sim = F, print.each.one = F, export.JSON = T, plot = F, verbose = F)))
if(!is.character(output)) attempts.left <- 0 # If the function worked, "output" will have been overwritten with NULL, so we can quit already
attempts.left <- attempts.left - 1
}
}
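# e.g. make.web.app.data(1, arguments) processes the first row of the 'arguments' data frame
# built by make.list.of.combos.for.web.app() below; see the pipeline sketch after that function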
# wrapper for the gender.stats function used to make the dataset for Figure 1 - feeds in a vector of disciplines or journals, or a custom set (a dataframe with columns "item" and "position")
make.figure.one.dataset <- function(items = NULL, filter.type, author.positions = c("overall", "first", "last", "single"), custom.set = NULL, data.source = pubmed_sqlite, chunk.size = 10, nChunks = 100){
if(is.null(custom.set) & is.null(items)) return("Enter something to do")
if(!is.null(custom.set) & !is.null(items)) return("Pick one - 'custom set' or 'items'")
if(is.null(custom.set) & !is.null(items)) to.do <- expand.grid(item = items, position = author.positions) %>% mutate(item = as.character(item), position = as.character(position))
else to.do <- custom.set # you can enter a dataframe with columns "item" and "position"
output <- do.call("rbind", lapply(1:nrow(to.do), function(xx) {
successful <- FALSE; attempts <- 0; answer <- 0
while(!successful & attempts < 10){ # Because they can fail stochastically, try each one a few times.
try(answer <- gender.stats(item = to.do$item[xx], filter.type=filter.type, position = to.do$position[xx], data.source = data.source, chunk.size=chunk.size, nChunks=nChunks,
plot = F, print.each.one = T, export.JSON = F, run.sim = T))
if(!is.numeric(answer)) successful <- TRUE
attempts <- attempts + 1
}
if(!successful) { # check success rather than the attempt count, so a run that succeeds on the 10th try is still returned
print("Tried 10 times, giving up")
return(NULL)
}
else return(answer)
}
))
output <- output %>% mutate(position = factor(position, levels = c("first", "last", "single", "overall")))
if("discipline" %in% names(output)) output <- output %>% arrange(discipline, position)
if("category" %in% names(output)) output <- output %>% arrange(category, sub.category, position)
output
}
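# Example (a sketch; the discipline names are hypothetical placeholders):
# figure.one.data <- make.figure.one.dataset(items = c("Ecology", "Surgery"), filter.type = "disc")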
# Get all possible combinations of journal/disc/country/author position, which we will need to make the data for the web app
make.list.of.combos.for.web.app <- function(unique.discs, unique.journals, unique.countries){
arguments <- data.frame(item = c(unique.discs, unique.journals),
filter.type = as.character(unlist(mapply(rep, c("disc", "journal"), each = c(length(unique.discs), length(unique.journals))))),
stringsAsFactors = F)
nr <- nrow(arguments)
arguments <- arguments[rep(1:nr, 4), ]
arguments$position <- rep(c("first", "last", "single", "overall"), each = nr)
nr <- nrow(arguments)
arguments <- arguments[rep(1:nr, 1 + length(unique.countries)), ]
arguments$country <- rep(c("all", unique.countries), each = nr)
suppl.args <- data.frame(item = "all", filter.type = "everything.for.one.country", position = NA, country = unique.countries)
nr <- nrow(suppl.args)
suppl.args <- suppl.args[rep(1:nr, 4), ]
suppl.args$position <- rep(c("first", "last", "single", "overall"), each = nr)
arguments <- rbind(arguments, suppl.args, data.frame(item = "all", filter.type = "everything.for.all.countries", position = c("first", "last", "single", "overall"), country = "all"))
rm(suppl.args)
to.keep <- rep(TRUE, nrow(arguments))
to.keep[arguments$filter.type == "journal" & arguments$country != "all"] <- FALSE
for(i in 1:length(unique.journals)){
print(i)
country.column.for.this.journal <- (pubmed_sqlite %>% filter(journal == unique.journals[i]) %>% select(country) %>% collect(n = Inf) %>% as.data.frame())[,1]
country.column.for.this.journal <- paste0(unique(country.column.for.this.journal), collapse = "") # paste all the unique affiliation lists into a single string
countries.present <- unique.countries[str_detect(country.column.for.this.journal, unique.countries)] # the names of all the countries represented in this journal
to.keep[arguments$filter.type == "journal" & arguments$item == unique.journals[i] & arguments$country %in% countries.present] <- TRUE
}
arguments <- arguments[to.keep, ]
arguments <- arguments[sample(nrow(arguments)), ] # shuffle the rows so that work is distributed evenly across cores
row.names(arguments) <- NULL
arguments
}
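# Sketch of the full web-app pipeline (assumes unique.discs, unique.journals and
# unique.countries have been defined earlier in the project):
# arguments <- make.list.of.combos.for.web.app(unique.discs, unique.journals, unique.countries)
# invisible(lapply(1:nrow(arguments), make.web.app.data, arguments = arguments))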
# Function used to tidy the data prior to export as supplementary files
neaten.the.data <- function(dd){
cols.to.round <- c("gender.ratio.at.present", "low.CI.1", "up.CI.1", "current.rate.of.change", "low.CI.2", "up.CI.2", "low.CI.3", "up.CI.3", "r", "c")
dd[, names(dd) %in% cols.to.round] <- round(dd[, names(dd) %in% cols.to.round], 3) # some rounding
dd <- dd %>% select(-notes) # empty column removed
dd$qualitative.result <- ""
dd$qualitative.result[dd$gender.ratio.at.present < 50 & dd$current.rate.of.change > 0] <- "Male-biased and unchanging" # Add these easier-to-read results
dd$qualitative.result[dd$gender.ratio.at.present > 50 & dd$current.rate.of.change > 0] <- "Female-biased and unchanging"
dd$qualitative.result[dd$gender.ratio.at.present < 50 & dd$current.rate.of.change > 0] <- "Male-biased but improving"
dd$qualitative.result[dd$gender.ratio.at.present > 50 & dd$current.rate.of.change < 0] <- "Female-biased but improving"
dd$qualitative.result[dd$gender.ratio.at.present < 50 & dd$current.rate.of.change < 0] <- "Male-biased and becoming more so"
dd$qualitative.result[dd$gender.ratio.at.present > 50 & dd$current.rate.of.change > 0] <- "Female-biased and becoming more so"
dd$qualitative.result[dd$gender.ratio.at.present > 45 & dd$gender.ratio.at.present < 55] <- "Essentially at parity"
dd$years.to.parity <- round(suppressWarnings(as.numeric(dd$years.to.parity)),3) # Tidy this, which originally had characters mixed in
dd
}
clean.up.data.for.figures <- function(df, filter.type){
df$qualitative.result <- NA # items with a numeric years-to-parity estimate keep NA here; only the clear-cut cases below get a label
df$qualitative.result[df$gender.ratio.at.present < 50 & df$current.rate.of.change < 0] <- "Male-biased and becoming more so"
df$qualitative.result[df$gender.ratio.at.present > 50 & df$current.rate.of.change > 0] <- "Female-biased and becoming more so"
df$qualitative.result[df$gender.ratio.at.present > 45 & df$gender.ratio.at.present < 55] <- "Essentially at parity"
df$low.CI.3[df$gender.ratio.at.present > 45 & df$gender.ratio.at.present < 55] <- NA
df$up.CI.3[df$gender.ratio.at.present > 45 & df$gender.ratio.at.present < 55] <- NA
# Remove the dots for items that might be either going up or going down, and we're not sure (i.e. the rate of change is indistinguishable from zero)
df$years.to.parity[is.na(df$low.CI.3)] <- NA
df$years.to.parity[is.na(df$up.CI.3)] <- NA
df$years.to.parity <- suppressWarnings(as.numeric(df$years.to.parity))
df$years.to.parity[df$qualitative.result %in% "Essentially at parity"] <- 0 # %in% rather than ==, so the NAs in qualitative.result do not break the subscripted assignment
df$position[df$position == "overall"] <- "Overall"
df$position[df$position == "first"] <- "First"
df$position[df$position == "last"] <- "Last"
df$position[df$position == "single"] <- "Single"
df$position <- factor(df$position, levels = c("Overall", "First", "Last", "Single"))
if(filter.type == "disc") df <- df %>% arrange(position, gender.ratio.at.present) %>% mutate(discipline = factor(discipline, levels = unique(discipline)))
if(filter.type == "journal") df <- df %>% arrange(position, discipline, gender.ratio.at.present) %>% mutate(journal = factor(journal, levels = unique(journal)))
df$position <- factor(df$position, levels = c("First", "Last", "Single", "Overall"))
return(df)
}
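# e.g. (a sketch, continuing the hypothetical object from make.figure.one.dataset above):
# figure.ready <- clean.up.data.for.figures(figure.one.data, filter.type = "disc")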
# Big function to make figures like Figs 1 and 2
main.figure <- function(dataset, xlim1, xlim2 = c(-2,2), add.density = T, add.density3 = T, custom.remove.first = F, single.present = T){
cols <- c(brewer.pal(3, "Set1"), "black")
if(!single.present) cols <- c(brewer.pal(3, "Set1")[1:2], "black")
longest.name <- dataset$discipline[nchar(as.character(dataset$discipline)) == max(nchar(as.character(dataset$discipline)))][1]
d1 <- dataset %>% ggplot(aes(x = gender.ratio.at.present, fill = position, colour = position)) + theme_classic() +
theme(legend.position = "none", title = element_blank(), axis.text.x = element_blank(), axis.ticks = element_blank(), axis.text.y = element_text(colour = "white"), axis.line.y = element_blank()) +
scale_x_continuous(expand = c(0,0)) + scale_y_continuous(expand = c(0,0), breaks = 0, labels = longest.name) + coord_cartesian(xlim = xlim1) +
geom_density(alpha=0.1) + xlab(NULL) + ylab("Frequency") + scale_fill_manual(values = cols) + scale_colour_manual(values = cols)
d2 <- dataset %>% ggplot(aes(x = current.rate.of.change, fill = position, colour = position)) + theme_classic() +
theme(legend.position = "none", text = element_blank(), title = element_blank(), axis.ticks = element_blank(), axis.line.y = element_blank()) +
scale_x_continuous(expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + coord_cartesian(xlim = xlim2) +
geom_density(alpha=0.1) + xlab(NULL) + ylab(" ") + scale_fill_manual(values = cols) + scale_colour_manual(values = cols) + theme(legend.position = "none")
d3.data <- dataset
if(custom.remove.first){
d3.data <- dataset %>% filter(position != "First") %>% mutate(position = factor(position, levels = c("Last", "Single", "Overall")))
new.cols <- c(brewer.pal(3, "Set1")[2:3], "black")
}
d3 <- d3.data %>% ggplot(aes(x = years.to.parity, fill = position, colour = position)) + theme_classic() +
theme(legend.position = "none", text = element_blank(), title = element_blank(), axis.ticks = element_blank(), axis.line.y = element_blank()) +
scale_x_continuous(expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + coord_cartesian(xlim = c(0,100)) + xlab(NULL) + ylab(" ") + theme(legend.position = "none")
if(add.density3) {
if(!custom.remove.first) d3 <- d3 + geom_density(alpha=0.1) + scale_fill_manual(values = cols) + scale_colour_manual(values = cols)
if(custom.remove.first) d3 <- d3 + geom_density(alpha=0.1) + scale_fill_manual(values = new.cols) + scale_colour_manual(values = new.cols)
}
else d3 <- d3 + theme(axis.line.x = element_blank())
p1 <- dataset %>% ggplot(aes(x = gender.ratio.at.present, y = discipline, colour = position)) + geom_vline(xintercept = 50, linetype=2) +
geom_errorbarh(alpha = 0.6, height = 0, aes(xmin = low.CI.1, xmax = up.CI.1)) + geom_point(alpha=0.8) + xlab("% women authors\nin 2016") + ylab(NULL) + scale_colour_manual(values = cols) + labs(colour = "Author\nposition") + coord_cartesian(xlim = xlim1)
p2 <- dataset %>% ggplot(aes(x = current.rate.of.change, y = discipline, colour = position)) + geom_vline(xintercept = 0, linetype=2) +
geom_errorbarh(alpha = 0.6, height = 0, aes(xmin = low.CI.2, xmax = up.CI.2)) + geom_point(alpha=0.8) + coord_cartesian(xlim = xlim2) +
xlab("Change in % women\nauthors per year") + ylab(NULL) + theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.position = "none") + scale_colour_manual(values = cols)
p3 <- dataset %>% ggplot(aes(x = years.to.parity, y = discipline, colour = position)) +
geom_errorbarh(alpha = 0.6, height = 0, aes(xmin = low.CI.3, xmax = up.CI.3)) + geom_point(alpha=0.8) +
xlab("Years until parity") + ylab(NULL) + theme(legend.position = "none", axis.text.y = element_blank(), axis.ticks.y = element_blank()) + scale_colour_manual(values = cols) + coord_cartesian(xlim = c(0, 100))
g <- ggplotGrob(p1 + theme(legend.position = "right"))$grobs # Get the legend from p1
legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
lwidth <- sum(legend$width) # save the width of the legend
gl.d <- lapply(list(d1,d2,d3), function(x) ggplotGrob(x + theme(legend.position="none"))) # convert the plots to ggplotGrobs to allow cbind
gl <- lapply(list(p1,p2,p3), function(x) ggplotGrob(x + theme(legend.position="none"))) # convert the plots to ggplotGrobs to allow cbind
if(add.density == T){
plots.combined <- arrangeGrob(cbind(gl.d[1][[1]], gl.d[2][[1]], gl.d[3][[1]]),
cbind(gl[1][[1]], gl[2][[1]], gl[3][[1]]),
nrow = 2,
heights = c(0.1,0.9))
}
else plots.combined <- arrangeGrob(cbind(gl[1][[1]], gl[2][[1]], gl[3][[1]]))
combined <- arrangeGrob(plots.combined, legend, ncol = 2, widths = unit.c(unit(1, "npc") - lwidth, lwidth)) # add legend
grid.newpage()
grid.draw(combined) # draw it
}