-
Notifications
You must be signed in to change notification settings - Fork 0
/
MSHA_post_pandas.Rmd
140 lines (96 loc) · 4.25 KB
/
MSHA_post_pandas.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
Analysis of panda-provided Modeled Data
========================================================
```{r}
require(reshape)
require(plyr)
require(ggplot2)
require(lubridate)
require(xtable)
require(scales)
require(glmnet)
require(survival)
require(TestSurvRec)
setwd('~/Code/MSHA/')
source('./lib/calendarHeat.r')
Sys.setlocale(locale="C")
```
```{r read.csv.files}
mines <- read.csv('data/wv_mines.csv', stringsAsFactors=FALSE)
op_days <- read.csv('data/iav_days.csv', stringsAsFactors=FALSE)
op_days$op_date <- gsub('-','/', gsub(' 00:00:00','',op_days$OP_DT))
op_days$op_date <- ymd(op_days$op_date)
op_days$op_year <- year(op_days$op_date)
op_days$op_month <- month(op_days$op_date)
op_days$op_day <- day(op_days$op_date)
op_days$fakeday <- ymd(paste('2012',op_days$op_month,op_days$op_day,sep='/'))
```
Pick a few mines and plot their profiles
```{r sample.plots}
# Use mines with abundant data
sample.mines <- subset(ddply(op_days, .(MINE_ID), summarize, vsum=sum(vcount)), vsum > 1000)$MINE_ID
sample.mines <- sample(sample.mines, 3)
sample.mines
sample.dat <- op_days[op_days$MINE_ID %in% sample.mines & op_days$op_year>2009,]
ggplot(sample.dat, aes(x=fakeday)) +
geom_line(aes( y=a30), color='red') +
geom_line(aes(y=a90), color='red', linetype='dashed') +
geom_line(aes(y=i30), color='green')+
geom_line(aes(y=v30), color='orange')+
facet_grid(op_year~MINE_ID)
```
Trying the `glm` library's logit model
```{r logit}
logit.acc <- glm(has_acc ~ a30 + v30 + i30, data = op_days, family = "binomial")
summary(logit.acc)
confint(logit.acc)
mine.day.stats$pred.acc <- predict(logit.acc, op_days[,c('v30','a30')])
ggplot(op_days) + geom_point(aes(x=v30, y=pred.acc, color=a30))
```
Join some useful categories to the per mine/day stats and flag accident days
```{r mine.joins}
mine.day.stats <- merge(mine.day.stats,
mdat[
,
c('MINE_ID','CURRENT_MINE_NAME','COAL_METAL_IND','CURRENT_OPERATOR_NAME','CURRENT_CONTROLLER_NAME','STATE','FIPS_CNTY_NM','NO_EMPLOYEES')
] ,
by.x=c('mine'),
by.y=c('MINE_ID')
)
mine.day.stats$bin.inj <- ifelse(is.na(mine.day.stats$DEGREE_INJURY_CD), 0, 1)
mine.day.stats$bin.acc <- ifelse(is.na(mine.day.stats$CLASSIFICATION), 0, 1)
head(subset(mine.day.stats, p.acc==1.0))
```
```{r two.mine.plot}
table(mine.day.stats$CLASSIFICATION)
ggplot(mine.day.stats) +
geom_segment(data=
mine.day.stats[
mine.day.stats$bin.acc==1.0 & grepl('FIRE |IGNITION', mine.day.stats$CLASSIFICATION)
,
]
, aes(x=op.dt,xend=op.dt, y=150, yend=0),color='yellow') +
geom_line(aes(x=op.dt,y=trailing.violations)) +
geom_line(aes(x=op.dt,y=trailing.accidents),color='red') +
geom_line(aes(x=op.dt,y=trailing.inspections),color='blue') +
facet_grid(CURRENT_MINE_NAME~.)
ggplot(mine.day.stats) +
geom_point(aes(x=trailing.accidents,y=trailing.violations)) + stat_smooth(aes(x=trailing.accidents,y=trailing.violations))
ggplot(mine.day.stats[mine.day.stats$mine==4601318, ]) + geom_line(aes(x=op.dt, y=trailing.violations), color='red')
```
Survival analysis approach
Data format explained [here](http://stats.stackexchange.com/questions/59636/how-do-you-prepare-longitudinal-data-for-survival-analysis)
Instead of day-by-day records, calculate accident-free periods with start and end dates, and assign a status of 1 for each accident
```{r survival}
# transform accident records into periods
# each accident is the end date of a period, and the preceding accident is the maximum date less than it
periods <-data.frame(
MINE_ID=adat$MINE_ID,
tstop =adat$accident.dt)
periods <- ddply(periods, .(MINE_ID, tstop), function(x) c(status=1, tstart=max(periods[periods$MINE_ID==x$MINE_ID & periods$tstop < x$tstop,'tstop'])))
periods$time <- as.numeric(periods$tstop - periods$tstart)/(3600*24)
periods$time <- ifelse(IsInf(periods$time), NULL, periods$time)
periods$time <- ifelse(is.infinite(periods$time), NA, periods$time)
periods$id <- periods$MINE_ID
plot(survfit(Surv(time, status) ~ 1, data=periods[periods$id %in% periods[1:20,'id'],] ),
conf.int=FALSE, mark.time=FALSE)
```