-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path0_scraptable.do
78 lines (66 loc) · 2.15 KB
/
0_scraptable.do
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
* scraptable -- load the cells of every HTML <table> in a page into memory.
*
* Syntax:   scraptable page [, Varnames Debug Html]
*
*   page      passed verbatim to fileread(); the whole document is read
*             into a single string variable.
*   varnames  rename each cell variable to strtoname() of its first
*             observation, then drop that observation. Renames that fail
*             are ignored (best effort).
*   debug     keep the intermediate working variables instead of only the
*             cell variables.
*   html      leave cell contents as raw HTML (skip -striphtml-).
*
* Result: one variable t<T>c<C> per table T and column C, one observation
* per table row. <th> cells are treated like <td> cells.
*
* Side effects: the data in memory are cleared.
* Requires the helper commands -striphtmlcomments- and -striphtml-.
*
* Errors: the fileread() return code if the page cannot be read;
*         r(2000) if the page has no <table> or the tables have no <tr>.
capture program drop scraptable
program define scraptable
    syntax anything(name=page equalok everything),[Varnames] [Debug] [Html]
    di `"Reading `page'"'
    quietly {
        clear
        set obs 1
        gen page=fileread(`"`page'"')

        // fileread() signals every failure by returning the text
        // "fileread() error #". Catch all codes, not only 601, and show the
        // message with -noisily- because we are inside a quietly block.
        if substr(page[1],1,17)=="fileread() error " & length(page[1])<30 {
            local rc=real(substr(page[1],18,.))
            if missing(`rc') local rc 601
            noisily di as error `"`page' could not be read"'
            error `rc'
        }

        striphtmlcomments page

        // Treat header cells as ordinary cells so one parser handles both.
        replace page=subinstr(page,"<th","<td",.)
        replace page=subinstr(page,"</th","</td",.)

        // Count tables: removing every "<table" (6 chars) shortens the
        // string by 6 per occurrence.
        gen scratch=page
        replace scratch=subinstr(scratch,"<table","",.)
        gen numTables=(length(page)-length(scratch))/6
        replace scratch=page

        if numTables[1]<1 {
            noisily di as error `"no <table> found in `page'"'
            exit 2000
        }

        // Slice out each table in document order. After a table is captured
        // its opening and closing markers are deleted from scratch so the
        // next strpos() finds the following table.
        forval t=1/`=numTables[1]' {
            gen start`t'=strpos(scratch,"<table")
            gen end`t'=strpos(scratch,"</table")
            gen table`t'=substr(scratch,start`t',end`t'-start`t'+7)
            gen numRows`t'=(length(table`t')-length(subinstr(table`t',"<tr","",.)))/3
            replace scratch=subinstr(scratch,"<table","",1)
            replace scratch=subinstr(scratch,"</table","",1)
        }

        // One observation per row of the longest table.
        egen maxRows=rowmax(numRows*)
        if maxRows[1]<1 {
            noisily di as error `"tables in `page' contain no <tr> rows"'
            exit 2000
        }
        if maxRows[1]>_N set obs `=maxRows[1]'

        // Split each table into rows: row i of table t goes to row`t'[i].
        forval t=1/`=numTables[1]' {
            replace scratch=table`t'[1]
            gen row`t'=""
            forval i=1/`=numRows`t'[1]' {
                local rowStart=strpos(scratch[1],"<tr")
                local rowEnd=strpos(scratch[1],"</tr")
                // +4 = length of "</tr", so the whole closing marker is
                // captured and removed with the row.
                replace row`t'=trim(substr(scratch[1],`rowStart',`rowEnd'-`rowStart'+4)) in `i'
                replace scratch=subinstr(scratch,row`t'[`i'],"",1) in 1
            }
            // Drop the opening <tr ...> tag itself.
            replace row`t'=substr(row`t',strpos(row`t',">")+1,.)
        }

        // Columns per table = the largest number of "<td" in any of its rows.
        gen temp=.
        forval t=1/`=numTables[1]' {
            replace temp=(length(row`t')-length(subinstr(row`t',"<td","",.)))/3
            egen numCols`t'=max(temp)
        }

        // Peel cells off the front of each row, one column at a time.
        forval t=1/`=numTables[1]' {
            replace scratch=row`t'
            forval i=1/`=numCols`t'[1]' {
                // Skip past the end of the opening <td ...> tag.
                replace scratch=substr(scratch,strpos(scratch,">")+1,.)
                gen t`t'c`i'=substr(scratch,1,strpos(scratch,"</td")-1)
                // Advance to the next cell.
                replace scratch=substr(scratch,strpos(scratch,"<td"),.)
                if "`html'"=="" striphtml t`t'c`i'
            }
        }

        if "`debug'"=="" keep t*c* numTables numCols*

        if "`varnames'"=="varnames" {
            forval t=1/`=numTables[1]' {
                forval i=1/`=numCols`t'[1]' {
                    // Best effort: duplicate or unusable names keep t#c#.
                    capture rename t`t'c`i' `=strtoname(t`t'c`i'[1])'
                }
            }
            drop in 1
        }

        if "`debug'"=="" drop numTables numCols*
    }
end