-
Notifications
You must be signed in to change notification settings - Fork 47
/
scrape.py
131 lines (111 loc) · 4.9 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup, Comment
import urllib
import os
import shutil
import string
import re
URL='http://www.diveintopython.net/'
GOOGLE_ANALYTICS_KEY = 'UA-9740779-18'
def scrape():
try:
p = open('save/dip.html', 'r')
soup = BeautifulSoup(p.read())
except IOError, e:
print "io error code: %d msg: %s" % (e.returncode, e.message)
return None
for i in soup.findAll('a'):
if i.has_key('href'):
if i['href'][0:4] == 'http' and '#' not in i['href']:
try:
filename = i['href'].split('/')[-2] + '/' + i['href'].split('/')[-1]
print "saving %s into %s" % (i['href'], filename, )
if not os.path.exists(i['href'].split('/')[-2]):
os.mkdir(i['href'].split('/')[-2])
with open(filename, 'w') as out:
out.write(urllib.urlopen(i['href']).read())
except IOError, e:
pass
def purify(filename):
with open(filename, 'r') as f:
soup = BeautifulSoup(f)
print "working on %s" % (filename, )
for div in soup.findAll('div'):
if div.has_key('id'):
if div['id'] == 'wm-ipp':
div.extract()
for script in soup.findAll('script'):
script.extract()
for comment in soup.findAll(text=lambda text:isinstance(text, Comment)):
comment.extract()
for link in soup.findAll('link'):
if link.has_key('rev'):
if link['rev'] == 'made':
link['href'] = 'josh@servercobra.com'
if link.has_key('rel'):
if link['rel'] == "home":
link['href'] = URL
if link['rel'] == "stylesheet":
link['href'] = "/css/diveintopython.css"
if link['rel'] == "next" or link['rel'] == "up" or link['rel'] == "previous":
link['href'] = URL + '/'.join(link['href'].split('/')[8:])
for a in soup.findAll('a'):
if a.has_key('href'):
if 'http://web.archive.org/' in a['href']:
print "print cleaning up link: %s" % (a['href'])
a['href'] = URL + '/'.join(a['href'].split('/')[8:])
if 'mailto:' in a['href']:
a['href'] = 'mailto:josh@servercobra.com'
#a['href'] = 'http://www.diveintopython.net/' a['href'].split('/')[8:]
#if 'http://diveintopython.net/' in a['href']:
for form in soup.findAll('form'):
if form.has_key('action'):
if 'http://web.archive.org/' in form['action']:
form['action'] = 'http://www.google.com/' + '/'.join(form['action'].split('/')[8:])
for img in soup.findAll('img'):
if img.has_key('src'):
if 'http://web.archive.org/' in img['src']:
img['src'] = URL + '/'.join(img['src'].split('/')[8:])
#TODO: insert Google Analytics
#soup.head.insert(len(a.head.contents), '<!-- comment -->')
# Insert Google Analytics Async Tracking Code
code = '''<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', '%s']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>''' % (GOOGLE_ANALYTICS_KEY, )
if GOOGLE_ANALYTICS_KEY not in soup.head.contents:
soup.head.insert(len(soup.head.contents), code)
new_soup = BeautifulSoup(soup.renderContents())
for i in new_soup.findAll('a'):
if i.has_key('href'):
if i['href'][0:4] == 'http':
#print i['href']
pass
with open(filename, 'w') as out:
out.write(new_soup.renderContents())
#def replace_url(old, new):
#for file in os.listdir('/home/josh/programming/diveintopython'):
#if os.path.isdir(file):
#directory = file
#for f in os.listdir(file):
#if 'html' in f:
#with open(directory + '/' + f, 'w+') as f2:
#text = f2.read()
#f2.write(re.sub('http://diveintopython.net', 'http://www.diveintopython.net', text))
if __name__ == '__main__':
    # Walk every subdirectory of the mirror and purify each HTML file.
    # NOTE(review): os.listdir uses an absolute path but os.path.isdir and
    # purify use the bare entry name, so this only works when the current
    # working directory IS that mirror root — confirm before running elsewhere.
    for entry in os.listdir('/home/josh/programming/diveintopython'):
        if os.path.isdir(entry):
            for f in os.listdir(entry):
                if 'html' in f:
                    purify(entry + '/' + f)