vs.py
210 lines (160 loc) · 5.46 KB
import sys
import os
import re
import urllib2
import urllib
import time
from sets import Set
from BeautifulSoup import BeautifulSoup
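# Usage sketch (a hypothetical invocation; example.com and "pdf" are
# placeholders, not values from the original project):
#
#   python vs.py example.com pdf
#
# argv[1] is the site to crawl (an http:// scheme is prepended if missing) and
# argv[2] is a pattern matched against linked filenames; matching files are
# saved under the downloads/ directory.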
def main():
    global site
    global format
    global base_path
    base_path = "downloads/"
    if not os.path.exists(base_path):
        os.makedirs(base_path)
    # Prepend a scheme if the given site doesn't already have one.
    if "http" not in sys.argv[1]:
        site = "http://" + sys.argv[1]
    else:
        site = sys.argv[1]
    format = sys.argv[2]
    print format
    try:
        runScraper(format)
    except urllib2.URLError:
        print "Are you sure you're online?"
# Grab the given url's html and make it pretty. Returns an object of type BeautifulSoup
def getPage(url):
    if isRelativePath(url):
        url = site + url
    return BeautifulSoup(urllib2.urlopen(url).read())
def isRelativePath(path):
    out_link = re.compile("http")
    return not out_link.match(path)
# Pop the next link to follow off the list of links. The regex parameter is
# currently unused.
def chooseLink(links, regex):
    return links.pop()
# Return all followable links.
# @NOTE: NYI
def removeExternalUrls(links):
    urls = []
    for link in links:
        if isRelativePath(link):
            urls.append(link)
    return urls
def removeJavascript(urls):
    return pruneListByRegex(urls, "javascript")
# Return all downloadable links which match our preferences.
# Preferences NYI
def getDownloadLinks(urls, file_format):
    return getUrlsByRegex(urls, file_format)
def removeRelativeLocationUrls(urls):
    return pruneListByRegex(urls, "comment")
def pruneListByRegex(a_list, format):
    results = []
    format = re.compile(format)
    for item in a_list:
        print item
        candidate = parseFilename(item)
        # Keep items whose filename does not match the pattern being pruned.
        if candidate != '' and not format.match(candidate):
            results.append(item)
    return results
def getUrlsByRegex(urls, format):
    # print format
    results = []
    format = re.compile(format)  # print urls
    for url in urls:
        candidate = parseFilename(url)
        if format.findall(candidate):
            results.append(url)
    return results
# Return urls given anchor hyperlink reference tags.
def parseLinks(link_list):
    urls = []
    for html_link in link_list:
        urls.append(parseLink(html_link))
    return urls
# Individual helper method.
def parseLink(tag_link):
    if tag_link.has_key("href"):
        tag_link = tag_link['href']
        url = tag_link.encode('ascii')
    else:
        url = ''
    return url
# Remove all links which have already been downloaded.
# @NOTE: NYI
def removeDownloaded(links):
    return links
# Remove all links which have already been followed.
def removeFollowed(urls, followed):
    s_urls = Set(urls)
    s_followed = Set(followed)
    s_urls.difference_update(s_followed)
    return s_urls
# Download all of the files from the given links.
def downloadFiles(fileNames, fileLinks):
    if len(fileNames) > 1:
        for name, link in zip(fileNames, fileLinks):
            downloadFile(name, link)
    else:
        downloadFile(fileNames.pop(), fileLinks.pop())
# Open a "web file" and write it to the current directory.
# @TODO: autopopulate a directory structure and give options.
def downloadFile(write_to, read_from):
file = open(base_path + write_to, 'w')
print >> file, (site + read_from)
# Return filenames given urls.
def parseFilenames(links):
    names = []
    for link in links:
        names.append(parseFilename(link))
    return names
# Individual helper method. Returns the last path component, which is ''
# when the url ends in a slash.
def parseFilename(link):
    return re.split("/", link).pop()
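# A quick sanity check of the helpers above (hypothetical URLs, not taken from
# any real site; paste into an interactive session if desired):
#
#   >>> parseFilename("/files/report.pdf")
#   'report.pdf'
#   >>> getUrlsByRegex(["/files/report.pdf", "/about.html"], "pdf")
#   ['/files/report.pdf']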
def runScraper(file_format):
    follow_links = []
    print "Scraping " + site + " ..."
    url = site
    follow_links.append(site)
    followed = []
    while follow_links:
        try:
            #time.sleep(1)
            # Grab the next link to follow
            url = chooseLink(follow_links, "http")
            print url
            # Add url to followed before it is traversed: in case of exception.
            followed.append(url)  #print followed
            # Grab the next page
            cur_page = getPage(url)  #print cur_page("a")
            # Get a list of the links from the current page and add them to be traversed
            all_page_links = cur_page("a")  #print all_page_links
            all_page_urls = parseLinks(all_page_links)  #print all_page_urls
            all_page_urls = removeJavascript(all_page_urls)
            all_page_urls = removeRelativeLocationUrls(all_page_urls)
            # Purge external urls from the links to follow.
            internal_urls = removeExternalUrls(all_page_urls)
            # Drop links that were already followed, then queue the rest.
            follow_links.extend(removeFollowed(internal_urls, followed))
            # Check the current page for candidate download links. Use all_page_urls,
            # because media hosting may be under a subdomain, or externally hosted.
            download_links = getDownloadLinks(all_page_urls, file_format)
            print "Parsed candidate download links:"
            if download_links:
                filenames = parseFilenames(download_links)
                print "File names parsed."
                # Remove all already downloaded files from the list.
                # Print each collision to stdout.
                download_links = removeDownloaded(download_links)
                print "Download collisions purged."
                # Download all candidate links.
                downloadFiles(filenames, download_links)
                print "Files on current page downloaded."
            else:
                print "No suitable downloads found on this page."
                print download_links
        except urllib2.HTTPError:
            print "Could not follow link."
    print "All paths have been traversed."
if __name__ == "__main__":
    main()