-
Notifications
You must be signed in to change notification settings - Fork 1
/
webScrape.py
36 lines (25 loc) · 1 KB
/
webScrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from lxml import html
import requests
import urllib
import os
page = requests.get('http://www.tenth.org/about/staff')
tree = html.fromstring(page.content)
staffList = tree.xpath("//div[@id='all-staff']//a[@class='person']")
for person in staffList:
name = person.xpath("header/h1/text()")[0]
image = person.xpath('@style')
if len(image) == 0: # if there's no image, there's nothing else for us here.
continue
image = image[0][22:-1]
names = name.split(' ')
# some folks have titles in their names, so let's pull those.
if names[0] == 'Dr.' or names[0] == 'Rev.':
user = names[1][0] + names[2]
else:
user = names[0][0] + names[1]
user = str.lower(user)
# delete existing, then download new image
if os.path.exists("rawImages/" + user + ".jpg"):
os.unlink("rawImages/" + user + ".jpg")
urllib.urlretrieve(image, "rawImages/" + user + ".jpg") # somehow, this seems to also convert PNGs to JPGs?
print "Downloaded image for " + user