-
Notifications
You must be signed in to change notification settings - Fork 0
/
01-extract-wv.R
60 lines (39 loc) · 2.22 KB
/
01-extract-wv.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
########################################################################
# Read Violations.TXT
# The file is too big to read all at once (about 1.5 million rows per the
# original note), so it is streamed in batches of `skip_count` rows through
# ONE open connection. Each batch is reduced to the mines present in mdat_wv
# before the next batch is read, and the reduced batches are bound together
# once at the end.
#
# Reading from an open connection continues where the previous batch stopped,
# so the file is scanned a single time and the loop ends at end-of-file
# instead of relying on a hard-coded row limit.
skip_count <- 250000
start_row <- skip_count + 1
wv_mine_keys <- mdat_wv[c("MINE_ID", "CURRENT_MINE_TYPE")]
vparts <- list()
vrows_collected <- 0
vcon <- file("./data/msha_source/Violations.TXT", open = "rt")
tryCatch({
  # The first batch includes the header row, which supplies the column names.
  vdat_temp <- read.table(vcon, nrows = skip_count, header = TRUE, sep = "|",
                          fill = TRUE, as.is = c(1:55), quote = "\"",
                          comment.char = "")
  vnames <- names(vdat_temp)
  vparts[[1L]] <- merge(wv_mine_keys, vdat_temp)
  vrows_collected <- nrow(vparts[[1L]])

  # Later batches have no header and take their names from the first batch.
  # Passing col.names fixes the column count for fill = TRUE and makes
  # read.table return a zero-row data frame at end-of-file rather than
  # stopping with "no lines available in input".
  # NOTE(review): these batches are read with quote = "" while the first
  # batch uses quote = "\"". Preserved as found -- confirm which setting is
  # intended for this file.
  repeat {
    print(paste("About to scan records ", start_row, " through " , start_row + skip_count))
    vdat_temp <- read.table(vcon, nrows = skip_count, header = FALSE, sep = "|",
                            fill = TRUE, as.is = c(1:55), col.names = vnames,
                            quote = "", comment.char = "")
    if (nrow(vdat_temp) == 0L) {
      break
    }
    vparts[[length(vparts) + 1L]] <- merge(wv_mine_keys, vdat_temp)
    vrows_collected <- vrows_collected + nrow(vparts[[length(vparts)]])
    print(paste("Rows collected: " , vrows_collected))
    # A short batch means the end of the file has been reached.
    if (nrow(vdat_temp) < skip_count) {
      break
    }
    start_row <- start_row + skip_count
  }
}, finally = close(vcon))
# Bind once: growing a data frame with rbind() inside the loop copies
# everything collected so far on every iteration.
vdat <- do.call(rbind, vparts)
write.csv(vdat, "./data/wv_vdat.csv")
########################################################################
# Read Inspections.TXT
# Streamed in batches of `skip_count` rows through ONE open connection.
# Each batch is reduced to the mines present in mdat_wv before the next batch
# is read, and the reduced batches are bound together once at the end.
#
# Reading from an open connection continues where the previous batch stopped,
# so the file is scanned a single time and the loop ends at end-of-file
# instead of relying on a hard-coded row limit.
skip_count <- 250000
start_row <- skip_count + 1
wv_mine_keys <- mdat_wv[c("MINE_ID", "CURRENT_MINE_TYPE")]
iparts <- list()
irows_collected <- 0
icon <- file("./data/msha_source/Inspections.TXT", open = "rt")
tryCatch({
  # The first batch includes the header row, which supplies the column names.
  idat_temp <- read.table(icon, nrows = skip_count, header = TRUE, sep = "|",
                          fill = TRUE, quote = "\"", comment.char = "")
  inames <- names(idat_temp)
  iparts[[1L]] <- merge(wv_mine_keys, idat_temp)
  irows_collected <- nrow(iparts[[1L]])

  # Later batches have no header and take their names from the first batch.
  # Passing col.names fixes the column count for fill = TRUE and makes
  # read.table return a zero-row data frame at end-of-file rather than
  # stopping with "no lines available in input".
  repeat {
    print(paste("About to scan records ", start_row, " through " , start_row + skip_count))
    idat_temp <- read.table(icon, nrows = skip_count, header = FALSE, sep = "|",
                            fill = TRUE, col.names = inames, quote = "\"",
                            comment.char = "")
    if (nrow(idat_temp) == 0L) {
      break
    }
    iparts[[length(iparts) + 1L]] <- merge(wv_mine_keys, idat_temp)
    irows_collected <- irows_collected + nrow(iparts[[length(iparts)]])
    print(paste("Rows collected: " , irows_collected))
    # A short batch means the end of the file has been reached.
    if (nrow(idat_temp) < skip_count) {
      break
    }
    start_row <- start_row + skip_count
  }
}, finally = close(icon))
# Bind once: growing a data frame with rbind() inside the loop copies
# everything collected so far on every iteration.
idat <- do.call(rbind, iparts)
write.csv(idat, "./data/wv_idat.csv")