-
Notifications
You must be signed in to change notification settings - Fork 2
/
wiki.py
119 lines (103 loc) · 3.63 KB
/
wiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import urllib2
import urllib
import os
import re
import cache
import encode
import lxml.html
import lxml
def fec(f_link, obj):
    """Record the FEC candidate id carried by *f_link*, if there is one.

    A link to the ``can_detail`` page on herndon1.sdrdc.com has everything
    after that path prefix stored in ``obj['links']['fec']``.  Any other
    link leaves *obj* untouched.  Nothing is returned.
    """
    found = re.search("http:\/\/herndon1\.sdrdc\.com\/cgi-bin\/can_detail\/(.*)$", f_link)
    if not found:
        return
    obj['links']['fec'] = found.group(1)
def cspan(f_link, obj):
    """Record the C-SPAN video identifier carried by *f_link*, if any.

    Two URL shapes are recognised, tried in this order:

    * ``http://c-spanvideo.org/person/<id>`` -- ``<id>`` may contain digits
      (reported as "numeric") or be a plain name (reported as "string").
    * ``http://c-spanvideo.org/<id>`` -- the short form.

    In every matching case the captured ``<id>`` is stored unchanged in
    ``obj['links']['cspan']`` and a one-line trace is printed.  Links that
    match neither shape leave *obj* untouched.  Nothing is returned.
    """
    person = re.search(r"http://c-spanvideo\.org/person/(.*)$", f_link)
    if person:
        val = person.group(1)
        # Pattern first, subject second: the scraped id must never be
        # compiled as a regex, because ids containing metacharacters such
        # as "(" would raise re.error.
        kind = "numeric" if re.search(r"\d+", val) else "string"
        print("cspan %s %s" % (kind, val))
        obj['links']['cspan'] = val
        return

    short = re.search(r"http://c-spanvideo\.org/(.*)$", f_link)
    if short:
        val = short.group(1)
        print("cspan short %s" % val)
        obj['links']['cspan'] = val
def ballot(f_link, obj):
    """Record the Ballotpedia page name carried by *f_link*, if there is one.

    For a ``ballotpedia.org/wiki/index.php/<page>`` link, ``<page>`` is
    stored in ``obj['links']['ballot']``.  Other links are ignored.
    """
    found = re.search("http:\/\/ballotpedia\.org\/wiki\/index\.php\/(.*)$", f_link)
    if not found:
        return
    obj['links']['ballot'] = found.group(1)
def opencongress(f_link, obj):
    """Record the OpenCongress person slug carried by *f_link*, if any.

    For a ``http://www.opencongress.org/people/show/<slug>`` link, the
    upper-cased ``<slug>`` is stored in ``obj['links']['opencong']``.
    Other links leave *obj* untouched.  Nothing is returned.
    """
    # Dots in the host name are escaped so they match a literal "." only,
    # not an arbitrary character.
    found = re.search(r"http://www\.opencongress\.org/people/show/(.*)$", f_link)
    if found:
        obj['links']['opencong'] = found.group(1).upper()
def congbio(f_link, obj):
    """Record the Congressional Bioguide index carried by *f_link*, if any.

    For a ``http://bioguide.congress.gov/scripts/biodisplay.pl?index=<id>``
    link, the upper-cased ``<id>`` is stored in ``obj['links']['bioguide']``.
    Other links leave *obj* untouched.  Nothing is returned.
    """
    # Dots in the host name are escaped so they match a literal "." only.
    found = re.search(
        r"http://bioguide\.congress\.gov/scripts/biodisplay\.pl\?index=(.*)$",
        f_link,
    )
    if found:
        obj['links']['bioguide'] = found.group(1).upper()
def votesmart(f_link, obj):
    """Record the Project Vote Smart candidate number carried by *f_link*.

    For a ``http://www.votesmart.org/candidate/<digits>`` link, the digit
    string is stored in ``obj['links']['votesmart']``.  Links with a
    non-numeric tail, or any other link, leave *obj* untouched.  Nothing is
    returned.
    """
    # Dots in the host name are escaped so they match a literal "." only.
    found = re.search(r"http://www\.votesmart\.org/candidate/(\d+)$", f_link)
    if found:
        obj['links']['votesmart'] = found.group(1)
def wikipedia(f_link, obj):
    """Record the English Wikipedia article title carried by *f_link*.

    Example link: http://en.wikipedia.org/wiki/Aaron_Schock

    The non-empty text after ``/wiki/`` is stored in
    ``obj['links']['wikipedia']``.  Other links are ignored.
    """
    found = re.search("http:\/\/en\.wikipedia\.org\/wiki\/(.+)$", f_link)
    if not found:
        return
    obj['links']['wikipedia'] = found.group(1)
def govhomepage(f_link, obj):
    """Index *obj* under its government home page URL.

    A link that starts an ``http:`` URL and ends in ``gov/`` is used as a
    key in ``obj['links']['homepage']`` (which must already exist and
    support item assignment); the value stored is *obj* itself.  Pointing
    from the link back to the object is meant to let data sets be merged
    based on the home page.
    """
    looks_like_gov_home = re.search("http:.*gov/$", f_link)
    if looks_like_gov_home:
        homepages = obj['links']['homepage']
        homepages[f_link] = obj
def parse_wiki_page_links(d, reps, obj):
    """Run every link extractor over the ``href`` links of document *d*.

    *d* must provide ``iterlinks()`` yielding ``(element, attribute, link,
    pos)`` tuples (parse_wiki_page passes the result of
    ``lxml.html.document_fromstring``).  Each link whose attribute is
    ``'href'`` is handed to all extractors, which fill in ``obj['links']``.
    *reps* is accepted but not used here.

    Returns *obj*.
    """
    # Same extractors, in the same order, for every href.
    extractors = (
        opencongress,
        ballot,
        congbio,
        votesmart,
        govhomepage,
        wikipedia,
        cspan,
        fec,
    )
    for link_info in d.iterlinks():
        _element, attr, f_link, _pos = link_info
        if attr != 'href':
            continue
        for extract in extractors:
            extract(f_link, obj)
    return obj
def parse_wiki_page(x, reps, obj):
    """Load the printable Wikipedia page at path *x* and harvest its links.

    *x* is spliced directly after ``http://en.wikipedia.org``.  The page
    text comes from ``cache.cachewp`` (presumably a caching fetch -- confirm
    against the cache module), is parsed with lxml and passed to
    parse_wiki_page_links together with *reps* and *obj*.

    Returns whatever parse_wiki_page_links returns.
    """
    page_url = 'http://en.wikipedia.org%s?action=purge&printable=yes' % x
    document = lxml.html.document_fromstring(cache.cachewp(page_url))
    return parse_wiki_page_links(document, reps, obj)
def parse_wiki_text(d, reps):
    """Extract the fields of every ``{{CongLinks ...}}`` template in *d*.

    *d* is wiki source text.  For each template found on a single line, the
    text between ``{{CongLinks`` and the last ``}}`` on that line is taken,
    ``&newMem=Y`` / ``&newmem=Y`` are removed from it, and it is split on
    ``|`` into ``key = value`` pieces.  Keys have spaces and single quotes
    removed; values have surrounding spaces and then surrounding single
    quotes stripped.  Pieces without ``=`` or with an empty key are skipped.

    *reps* is accepted but not used here.

    Returns a dict of the collected key/value pairs plus a ``"raw"`` entry
    holding the concatenated template bodies (``""`` when none was found).
    """
    fields = {"raw": ""}
    # NOTE(review): ".+" is greedy, so a second "}}" later on the same line
    # is swallowed into the captured body -- confirm this is acceptable.
    for template in re.finditer(r"\{\{CongLinks(.+)\}\}", d):
        val = template.group(1)
        val = val.replace("&newMem=Y", "")
        val = val.replace("&newmem=Y", "")
        for piece in val.split("|"):
            # Need an "=" that is not the very first character.
            if piece.find("=") <= 0:
                continue
            # Split on the first "=" only, so values that themselves
            # contain "=" (e.g. URLs with query strings) are kept instead
            # of failing to unpack and being dropped.
            k, v = piece.split("=", 1)
            k = k.replace(" ", "").replace("'", "")
            v = v.strip(" ").strip("'")
            fields[k] = v
        fields["raw"] = fields["raw"] + val
    return fields
def parse_wiki_source(x, reps):
    """Load the raw wiki source of article *x* and parse its CongLinks.

    *x* is inserted as-is into the ``title=`` query parameter of the
    ``action=raw`` URL.  The text comes from ``cache.cachewp`` (presumably a
    caching fetch -- confirm against the cache module) and is handed to
    parse_wiki_text together with *reps*.

    Returns whatever parse_wiki_text returns.
    """
    raw_url = 'http://en.wikipedia.org/w/index.php?title=%s&action=raw' % x
    wiki_text = cache.cachewp(raw_url)
    return parse_wiki_text(wiki_text, reps)