-
Notifications
You must be signed in to change notification settings - Fork 0
/
msha_pandas.py
214 lines (144 loc) · 7.23 KB
/
msha_pandas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#%%
import pandas as pd
import numpy as np
import dateutil
import os
import statsmodels.api as sm
import lifelines
os.chdir('/Users/mfrost/Code/MSHA')
#%%
mines = pd.read_table('data/msha_source/Mines.txt', sep='|' , parse_dates=['CURRENT_CONTROLLER_BEGIN_DT'])
mines_index = mines.set_index(['STATE','MINE_ID','CURRENT_MINE_STATUS','CURRENT_STATUS_DT','CURRENT_CONTROLLER_ID','CURRENT_CONTROLLER_NAME','CURRENT_OPERATOR_ID','CURRENT_OPERATOR_NAME'])
accidents = pd.read_table('data/msha_source/Accidents.txt', sep='|',parse_dates=['ACCIDENT_DT','INVEST_BEGIN_DT'])
accidents_index = accidents.set_index(['MINE_ID','CAL_YR','CAL_QTR','ACCIDENT_DT','CONTROLLER_ID','CONTROLLER_NAME','OPERATOR_ID','OPERATOR_NAME'])
inspections = pd.read_table('data/msha_source/Inspections.txt', sep='|', parse_dates=['INSPECTION_BEGIN_DT','INSPECTION_END_DT'])
inspections_index = inspections.set_index(['EVENT_NO','MINE_ID','CAL_YR','CAL_QTR','INSPECTION_BEGIN_DT','INSPECTION_END_DT','CONTROLLER_ID','CONTROLLER_NAME','OPERATOR_ID','OPERATOR_NAME','ACTIVITY_CODE','PROGRAM_AREA','COAL_METAL_IND'])
violations = pd.read_table('data/msha_source/Violations.txt', sep='|', parse_dates=['VIOLATION_ISSUE_DT','VIOLATION_OCCUR_DT','INSPECTION_BEGIN_DT','INSPECTION_END_DT','CONTESTED_DT'])
violations_index = violations.set_index(['VIOLATION_NO','EVENT_NO','MINE_ID','CAL_YR','CAL_QTR','INSPECTION_BEGIN_DT','INSPECTION_END_DT','CONTROLLER_ID','CONTROLLER_NAME','MINE_TYPE','COAL_METAL_IND','VIOLATION_OCCUR_DT','VIOLATION_ISSUE_DT','SIG_SUB','LIKELIHOOD','INJ_ILLNESS','NO_AFFECTED','NEGLIGENCE'])
# For unknown reasons, the VIOLATION_DT field isn't being parsed as a date
violations['VIOLATION_OCCUR_DT']=violations['VIOLATION_OCCUR_DT'].apply(dateutil.parser.parse)
# Full Data Loading Complete
#%%
# Filter to just a state of interest
sub_state = 46
mine_ids = mines['MINE_ID'][mines['BOM_STATE_CD'] == sub_state]
mines = mines[:][mines['BOM_STATE_CD'] == sub_state]
mines.to_csv('data/wv_mines.csv',index='FALSE')
violations = violations[:][violations['MINE_ID'].isin(mine_ids)]
violations.to_csv('data/wv_violations.csv',index='FALSE')
inspections = inspections[:][inspections['MINE_ID'].isin(mine_ids)]
inspections.to_csv('data/wv_inspections.csv',index='FALSE')
accidents = accidents[:][accidents['MINE_ID'].isin(mine_ids)]
accidents.to_csv('data/wv_accidents.csv',index='FALSE')
# Filtering complete
#%%
# Aggregation tasks:
# - build a list of operating days for each mine
# - given a mine, a date, and a trailing window, return the number of violations,
# accidents, fatalities, and inspections that the window contains.
# - integrate available production data
# Operating days per mine
# and trailing event counts can be done together now?
# From http://stackoverflow.com/questions/15489011/python-time-series-alignment-and-to-date-functions
def i_counter(grouped):
se = grouped.set_index('INSPECTION_BEGIN_DT')['EVENT_NO']
# se is the time series of inspection dates restricted to a single MINE_ID
se = se.resample("D")
df = pd.DataFrame({'event_no':se, 'i90':pd.rolling_count(se, 90), 'i30':pd.rolling_count(se, 30)})
return df
# Test:
df_test = inspections[:][inspections['MINE_ID'].isin([4601318, 4601456])].groupby('MINE_ID').apply(i_counter)
df_test.tail(10)
# Run all the inspections:
i_days = inspections.groupby('MINE_ID').apply(i_counter)
i_days.tail(10)
# The MINE_ID and dates are now in a MultiIndex, and can't be
# queried as columns
# Use the .xs() function to select a cross-section
i_days.xs(4601318).tail(20)
#%%
# Accidents
def a_counter(grouped):
se = grouped.set_index('ACCIDENT_DT')['DEGREE_INJURY_CD']
# se is the time series of accident dates restricted to a single MINE_ID
se = se.resample("D")
df = pd.DataFrame({'degree_injury_cd':se, 'a90':pd.rolling_count(se, 90), 'a30':pd.rolling_count(se, 30)})
# TODO: include a sum of injury counts by day
return df
# Test:
df_test = accidents[:][accidents['MINE_ID'].isin([4601318, 4601456])].groupby('MINE_ID').apply(a_counter)
df_test.tail(10)
# Run all the accidents:
a_days = accidents.groupby('MINE_ID').apply(a_counter)
a_days.tail(10)
#%%
# Violations
violations.VIOLATION_NO = violations.VIOLATION_NO.convert_objects(convert_numeric=True)
# to find a date:
violations[:][(violations['VIOLATION_OCCUR_DT']=='20130819') & (violations['MINE_ID']==4601456)]
violations['VIOLATION_NO'][(violations['VIOLATION_OCCUR_DT']=='20130819') & (violations['MINE_ID']==4601456)]
v_test=violations[:][violations['MINE_ID'].isin([4601318, 94601456])]
grouped = violations.groupby(['MINE_ID','VIOLATION_OCCUR_DT'])
v_days = grouped['VIOLATION_NO'].agg({'vcount':np.size})
# Having trouble resampling the days when the date is in the MultiIndex
# http://stackoverflow.com/questions/15799162/resampling-within-a-pandas-multiindex
v_days.reset_index(level=[0,1], inplace=True)
v_days = pd.DataFrame(v_days)
v_days.head()
v_days['VIOLATION_OCCUR_DT'] = pd.to_datetime(v_days['VIOLATION_OCCUR_DT'])
v_days.head()
v_days = v_days.sort(['MINE_ID','VIOLATION_OCCUR_DT'])
v_days.head()
v_days.set_index(pd.DatetimeIndex(v_days['VIOLATION_OCCUR_DT']), inplace=True)
# Resample and fill non-violation days
def _sum(x):
if len(x) == 0: return 0
else: return sum(x)
v_days.resample('D',how=_sum)
del v_days['VIOLATION_OCCUR_DT']
v_days.groupby('MINE_ID').resample('D',how=_sum).head()
v_days = v_days.groupby(['MINE_ID']).resample('D',how=_sum)
del v_days['MINE_ID']
v30 = v_days.apply(lambda x: pd.rolling_sum(x,30,min_periods=1))
v30.columns = ['v30']
v90 = v_days.apply(lambda x: pd.rolling_sum(x,90,min_periods=1))
v90.columns = ['v90']
v_days['v30'] = v30.v30
v_days['v90'] = v90.v90
v_days.head(30)
#%%
# Merge accidents to inspections and violations
# First, rename all the index keys
i_days.index.names = ['MINE_ID','OP_DT']
a_days.index.names = ['MINE_ID','OP_DT']
v_days.index.names = ['MINE_ID','OP_DT']
ia_days = pd.merge(i_days, a_days, how='left', left_index= True , right_index= True)
iav_days = pd.merge(ia_days, v_days, how='left', left_index=True, right_index=True)
# with the entire US data set, it processed 63,252,285 records in between 3 and 5 hours
iav_days['has_acc'] = iav_days['degree_injury_cd'] >= 0
iav_days[iav_days['has_acc'] == 1].head()
#%%
iav_days.to_csv('data/iav_days.csv',index='FALSE')
#%%
op_days = iav_days.reset_index(level=[0])
submines = mines[['MINE_ID','COAL_METAL_IND','CURRENT_MINE_TYPE','CURRENT_CONTROLLER_ID','CURRENT_OPERATOR_ID']]
op_days = pd.merge(op_days, submines, on='MINE_ID')
#%% Logit
# Drop the na values because logit can't handle them
op30 = op_days[['i30','v30','a30','has_acc']].dropna(axis=0)
logit = sm.Logit(op30['has_acc'], op30[['v30','i30','a30']])
result = logit.fit()
print result.summary()
# look at the confidence interval of each coeffecient
print result.conf_int()
# odds ratios
print np.exp(result.params)
params = result.params
conf = result.conf_int()
conf['OR'] = params
conf.columns = ['2.5%', '97.5%', 'OR']
print np.exp(conf)
# Trailing inspections and violations inside an arbitrary window doesn't result in a very cool model.
# Survival analysis might be better.
# TODO: convert the data toright-censored durations and use the lifelines
# library: http://nbviewer.ipython.org/github/CamDavidsonPilon/lifelines/blob/master/Tutorial%20and%20Examples.ipynb