-
Notifications
You must be signed in to change notification settings - Fork 0
/
0_read_data_scans.R
129 lines (116 loc) · 5.25 KB
/
0_read_data_scans.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# 0_read_data_scans.R
# read the data from the study Fibroscanner
# May 2023
library(readxl)
library(dplyr)
library(tidyverse)
library(janitor)
source('99_functions.R')
# find all the files and read them in
data_location = 'U:\\Research\\Projects\\ihbi\\aushsi\\locatenafld\\TRIAL\\Data collection\\scans'
to_read = dir(data_location, pattern='xlsx$')
raw_scans = NULL
for (f in to_read){ # used `text` for all to make binding easier
this_scan = readxl::read_excel(path = paste(data_location, '\\', f, sep=''), col_types='text', sheet='Data') %>%
mutate(file = str_remove(f, '\\.xlsx')) %>% # add file name
clean_names() %>%
mutate(height = as.numeric(height),
weight = as.numeric(weight)
)
# name check
name_check = function(){
n = names(this_scan)
index = grep(pattern='^e_', x=n)
index = grep(pattern='^cap_', x=n)
e = n[index]
cat(f, '\n')
cat(paste(e, collapse='\n'), '\n\n')
}
## changes based on which type of scanner:
# rename for consistency
scanner1 = 'valid_measures_number' %in% names(this_scan)
if(scanner1 == TRUE){
this_scan = rename(this_scan,
'total_measurement_number' = 'valid_measures_number')
}
# rename for consistency
scanner1 = 'e_med_k_pa' %in% names(this_scan)
if(scanner1 == TRUE){
this_scan = rename(this_scan,
'e_median_k_pa' = 'e_med_k_pa',
'e_iqr_median_percent' = 'e_iqr_e_med')
}
# rename for consistency
scanner1 = 'cap_med_d_b_m' %in% names(this_scan)
if(scanner1 == TRUE){
this_scan = rename(this_scan,
'cap_median_d_b_m' = 'cap_med_d_b_m',
'cap_iqr_d_b_m' = 'cap_iqr_d_b_m')
}
# make numbers
this_scan = mutate(this_scan,
e_median_k_pa = as.numeric(e_median_k_pa),
e_iqr_k_pa = as.numeric(e_iqr_k_pa),
e_iqr_median_percent = as.numeric(e_iqr_median_percent),
cap_median_d_b_m = as.numeric(cap_median_d_b_m),
cap_iqr_d_b_m = as.numeric(cap_iqr_d_b_m),
total_measurement_number = as.numeric(total_measurement_number))
# makes numbers for extra variables
scanner1 = 'cap_enhanced_mean_d_b_m' %in% names(this_scan)
if(scanner1 == TRUE){
this_scan = mutate(this_scan,
cap_enhanced_mean_d_b_m = as.numeric(cap_enhanced_mean_d_b_m),
cap_enhanced_sd_d_b_m = as.numeric(cap_enhanced_sd_d_b_m),
cap_level_percent = as.numeric(cap_level_percent))
}
# concatenate
raw_scans = bind_rows(raw_scans, this_scan)
}
# tidy up scan data
# not yet confirmed that these are all the variables we need
scans = separate(raw_scans, exam_time, into=c('exam_hour','exam_min','exam_sec'), sep=':') %>%
mutate(
record_id = coalesce(code, pharma_study_code, pharma_patient_code), # ID number could be in any of these
# dates
exam_date = as.Date(ISOdate(year=exam_date_year, month=exam_date_month, day=exam_date_day)),
exam_datetime = ISOdate(year=exam_date_year, month=exam_date_month, day=exam_date_day, hour=exam_hour, min=exam_min, sec=exam_sec, tz='Australia/Brisbane'),
exam_duration_min = as.numeric(str_replace_all(exam_duration_min, ',', '.')), # remove comma
exam_duration_min = ifelse(!is.na(exam_duration_s), as.numeric(exam_duration_s)/60, exam_duration_min), # use exam duration in seconds if available
#
height = as.numeric(height),
height = ifelse(height > 100, height/100, height), # convert cm to m
weight = as.numeric(weight),
bmi = weight/(height*height),
total_measurement_number = as.numeric(total_measurement_number),
birth_date_year = as.numeric(birth_date_year)) %>%
separate(record_id, into=c('centre','num'), sep = '-', remove=FALSE) %>% # extract centre number; keep original record id variable
select('record_id','file','centre','num','gender','height','weight','bmi','birth_date_year',
'fasting','exam_date', 'exam_datetime','exam_duration_min','exam_type',
'total_measurement_number', starts_with('e_'), starts_with('cap_')) %>%
select(-contains('_number_')) %>% # do not need individual readings
filter(!is.na(record_id)) %>% # remove missing rows
rename('probe_size' = 'exam_type') %>%
mutate(probe_size = ifelse(probe_size=='Medium', 'M', probe_size)) %>%
unique() # remove duplicates
# missing scans - no longer needed
#source('0_missing_scans.R')
# check double scans
tab = table(scans$record_id)
double = names(which(tab>2))
View(filter(scans, record_id == double[4]))
# Some almost duplicate rows with minor differences
# If same patient has two scans, then just use latest and/or with longest exam duration
scans = group_by(scans, record_id) %>%
select(-file) %>% # source file no longer needed
arrange(record_id, desc(exam_datetime), desc(exam_duration_min)) %>% # desc for latest time and longest duration
slice(1) %>%
ungroup() %>%
mutate(centre = as.numeric(centre),
num = as.numeric(num))
# save
save(scans, file = 'data/0_scans.RData')
table(table(scans$record_id)) # should be 1
## Compare scan dates with REDCap dates
load('data/1_redcap_data.RData')
data = select(data, record_id, fibroscan_date_2, kpa, cap) # slim down data
compare = left_join(scans, data, by='record_id')