-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathswatch_scrape.py
68 lines (56 loc) · 1.85 KB
/
swatch_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import lxml.html as html
import lxml.etree as etree
from tornado.httpclient import AsyncHTTPClient
from hashlib import sha1
import mimetypes
from tornado.ioloop import IOLoop
# Select the curl-based client implementation for every AsyncHTTPClient
# instance created later in this module.
AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
def sitemap(path='sitemap.xml'):
    """Return the text of every ``<loc>`` element in a sitemap file.

    Args:
        path: filesystem path of the sitemap XML document.

    Returns:
        A list with one entry per ``<loc>`` element found under the
        sitemaps.org 0.9 namespace, in document order.

    Errors raised by ``open`` or by the XML parser propagate to the caller.
    """
    # Read as bytes so the parser applies the encoding declared in the XML
    # itself, and use a context manager so the handle is closed even when
    # parsing fails.
    with open(path, 'rb') as handle:
        tree = etree.parse(handle)
    return tree.xpath(
        '//sitemap:loc/text()',
        namespaces={'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'})
def get_image(product_page):
    """Return the ``src`` of the first product image on a fetched page.

    ``product_page`` is a response object whose ``body`` holds the HTML.
    The image is looked up as ``<p class="product-image"><img src=...>``.

    Raises:
        IndexError: when the page contains no matching image; callers use
            this to detect pages without a product picture.
    """
    document = html.document_fromstring(product_page.body)
    image_sources = document.xpath('//p[@class="product-image"]/img/@src')
    # Indexing (rather than testing for emptiness) is deliberate: an empty
    # result surfaces as IndexError.
    first_source = image_sources[0]
    return first_source
# Page URLs still waiting to be crawled; Request.__init__ pops from the end.
crawl_queue = []
# Number of Request jobs currently in flight: incremented when a Request
# starts its page fetch, decremented in Request.finish.
running_reqs = 0
class Request(object):
    """One crawl job: fetch a product page, then download its image.

    Constructing an instance pops a URL from the module-level
    ``crawl_queue`` and immediately starts an asynchronous fetch.  The job
    then advances through the callbacks ``got_url`` -> ``got_image`` ->
    ``finish``.  The module-level ``running_reqs`` counts jobs in flight.
    """

    def __init__(self):
        """Take the next queued URL and start fetching it, if any remain."""
        global running_reqs
        try:
            self.page_url = crawl_queue.pop()
        except IndexError:
            # Queue is empty: stay inert (no fetch started, counter untouched).
            return
        print('> %s' % self.page_url)
        running_reqs += 1
        AsyncHTTPClient().fetch(self.page_url, self.got_url)

    def got_url(self, page):
        """Callback for the product page: find the image URL and fetch it."""
        if page.error is not None:
            # A failed fetch must still release its slot; otherwise the
            # exception raised while parsing would skip finish() and the
            # crawl would gradually stall.
            print('! %s (%s)' % (self.page_url, page.error))
            self.finish()
            return
        try:
            self.image_url = get_image(page)
        except IndexError:
            # get_image found no product image on this page.
            self.finish()
            return
        print(self.image_url)
        AsyncHTTPClient().fetch(self.image_url, self.got_image)

    def got_image(self, response):
        """Callback for the image: store the bytes under ``images/``."""
        if response.error is not None:
            print('! %s (%s)' % (self.image_url, response.error))
            self.finish()
            return
        content_type = response.headers.get('Content-Type', 'image/jpeg')
        filename = self._image_filename(response.body, content_type)
        try:
            # Image data is binary, so write it in 'wb' mode; the context
            # manager closes the handle promptly.
            with open('images/%s' % filename, 'wb') as out:
                out.write(response.body)
        finally:
            # Release the slot even if the write fails; the error itself
            # still propagates.
            self.finish()

    @staticmethod
    def _image_filename(body, content_type):
        """Build ``<sha1 of body><extension>`` for a downloaded image.

        Parameters after ``;`` in ``content_type`` are ignored.  When the
        MIME type has no known extension the name carries no suffix.
        """
        mime = content_type.split(';', 1)[0].strip()
        suffix = mimetypes.guess_extension(mime) or ''
        return '%s%s' % (sha1(body).hexdigest(), suffix)

    def finish(self):
        """Mark this job done and start a replacement job if work remains."""
        global running_reqs
        running_reqs -= 1
        # Start another job while fewer than 200 are in flight and URLs
        # remain queued.
        if running_reqs < 200 and len(crawl_queue) > 0:
            Request()
        print('< %s' % self.page_url)
        if running_reqs == 0 and not crawl_queue:
            # Nothing in flight and nothing queued: stop the loop so the
            # script can exit instead of idling forever.
            IOLoop.instance().stop()
def run(concurrency=10):
    """Queue every sitemap URL and crawl them on the Tornado IOLoop.

    Args:
        concurrency: number of Request jobs to start up front.  Each job
            that finishes starts a replacement while URLs remain, so this
            bounds the number of simultaneous jobs.  Defaults to 10, the
            value that used to be hard-coded.
    """
    crawl_queue.extend(sitemap())
    if not crawl_queue:
        # Nothing to crawl: starting the loop with no pending work would
        # block forever.
        return
    # Never start more jobs than there are URLs to hand out.
    for _ in range(min(concurrency, len(crawl_queue))):
        Request()
    # Blocks until the loop is stopped.
    IOLoop.instance().start()
# Start the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    run()