-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcandidateScrape.R
59 lines (53 loc) · 2.89 KB
/
candidateScrape.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
library(stringr)
# Build a SQL script (candidate2.txt) that recreates the `candidate` table and
# fills it with one row per name read from candidate.csv.
#
# For every name the script:
#   1. runs a Google search for "<name> politician",
#   2. takes the first en.wikipedia.org/wiki/... link in the result page,
#   3. downloads that article and pulls the birth date, death date and first
#      uploaded .jpg out of the raw HTML with regular expressions.
# Values that cannot be found are written as SQL NULL.
#
# NOTE(review): the loop walks the *columns* of the data frame, so this
# presumably expects candidate.csv to hold all names on a single
# comma-separated line -- confirm against the input file.
candidates <- read.csv("candidate.csv", header = FALSE, sep = ",",
                       stringsAsFactors = FALSE, encoding = "UTF-8")
baseUrl <- "http://www.google.com/search?q="
candidateFile <- "candidate2.txt" # The candidate file to write

# Return the first capture group of the first match of `pattern` in `text`,
# or NA when the pattern does not match at all.
firstCapture <- function(text, pattern) {
  str_match_all(text, pattern)[[1]][, 2][1]
}

# Backslash-escape single quotes so `value` can sit inside a '...' SQL string.
sqlEscape <- function(value) {
  gsub("\'", "\\\\\'", value)
}

# Render a scraped value as a quoted SQL string literal, or NULL when missing.
sqlLiteral <- function(value) {
  if (is.na(value)) {
    return("NULL")
  }
  paste0("'", sqlEscape(value), "'")
}

# The first write() truncates the output file; every later one appends.
write("DROP TABLE IF EXISTS `candidate`;", candidateFile)
write("CREATE TABLE presidential_elections.candidate ( name VARCHAR(64) NOT NULL , birth_date DATE , death_date DATE , image_url VARCHAR(128) );", candidateFile, append = TRUE)
write("INSERT INTO candidate (name, birth_date, death_date, image_url) VALUES ", candidateFile, append = TRUE)

for (i in seq_along(candidates)) {
  candidate <- candidates[[i]]

  # Search Google for the candidate; spaces become '+' in the query string.
  webQueryString <- gsub(" ", "+", candidate)
  url <- sprintf("%s%s%s", baseUrl, webQueryString, "+politician")
  print(sprintf("%s %d", candidate, i))
  print(url)
  html <- paste(readLines(url, encoding = "UTF-8"), collapse = "\n")
  Sys.sleep(1) # Throttle requests; without the pause fetches fail at random

  # The first Wikipedia link in the search results is taken as the article.
  matched <- str_match_all(html, "en.wikipedia.org/wiki/([:alpha:]*\\.?_?\\(?\\)?%?[:digit:]?'?)*")
  wikiPath <- matched[[1]][, 1][1]
  if (is.na(wikiPath)) {
    # Fail with a clear message instead of trying to open "http://NA".
    stop("No Wikipedia link found in search results for: ", candidate,
         call. = FALSE)
  }
  wikiUrl <- sprintf("%s%s", "http://", wikiPath)
  # Handle the John Fremont outlier case, whose percent-encoded character gets
  # mangled; everything from the first '%' to the last '9' becomes "%e9".
  wikiUrl <- gsub("%.*9", "%e9", wikiUrl)
  print(wikiUrl)
  wikihtml <- paste(readLines(wikiUrl), collapse = "\n")

  # The captured dates are 10 characters long, which suits a DATE column
  # when the page uses YYYY-MM-DD.
  birthDate <- firstCapture(wikihtml, "span class=\"bday\">(..........)</span")
  deathDate <- firstCapture(wikihtml, "span class=\"dday deathdate\">(..........)</span")
  print(birthDate)
  print(deathDate)

  # First uploaded .jpg on the page, if any.
  photo <- firstCapture(wikihtml, "src=\"(//upload.*.jpg)\"")
  # Thumbnail links look like ".../thumb/.../File.jpg/220px-File.jpg": cut
  # everything after the first ".jpg" and drop the thumb directory to get the
  # full-size image link.
  photo <- gsub("jpg/.*", "jpg", photo)
  photo <- gsub("/thumb", "", photo)
  print(photo)

  # Some final sanitizing for putting into SQL format
  cleanCandidate <- sqlEscape(candidate)
  bday <- sqlLiteral(birthDate)
  dday <- sqlLiteral(deathDate)
  photo <- sqlLiteral(photo)

  # Now do the file writing: every VALUES tuple except the last ends in ','.
  rowSuffix <- if (i == length(candidates)) "" else ","
  write(sprintf("('%s', %s, %s, %s)%s", cleanCandidate, bday, dday, photo, rowSuffix),
        candidateFile, append = TRUE)
}