# -*- coding: utf-8 -*-
"""
Created on Sun Jan 10 13:33:40 2021
@author: lando

This script contains tools for accessing sites and scraping data.
"""
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from socket import timeout
from datetime import datetime, timedelta
import re
import platform # For getting the operating system name
import subprocess # For executing a shell command
from time import sleep

# Decorator that appends the completion time of each call to the given
# log list
def log_request(fn, log):
    def wrapped_fn(*args, **kwargs):
        value = fn(*args, **kwargs)
        log.append(datetime.now())
        return value
    return wrapped_fn
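
# Usage sketch ('request_log' is a hypothetical name): rebind a function
# so every call is timestamped.
#   request_log = []
#   make_soup = log_request(make_soup, request_log)
#   # each subsequent make_soup call appends a datetime to request_log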

# Returns True if host (str) responds to a ping request, otherwise False
def ping(host):
    # Windows uses '-n' for the count option; Unix-like systems use '-c'
    param = '-n' if platform.system().lower() == 'windows' else '-c'
    command = ['ping', param, '1', host]  # e.g. "ping -c 1 google.com"
    return subprocess.call(command) == 0
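
# Usage sketch (needs a ping binary on PATH and network access):
#   ping('www.city-data.com')  # -> True if the host answers one ping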

# Return a BeautifulSoup object if successful, None if the site does
# not exist, or False if the request times out
def make_soup(url):
    try:
        site = urlopen(url, timeout=20)
    except (HTTPError, URLError) as error:
        # A timed-out connection surfaces as a URLError wrapping socket.timeout
        if isinstance(error.reason, timeout):
            print("Connection timed out.")
            return False
        else:
            print("Site does not exist.")
            return None
    except timeout:
        # A timeout can also be raised directly
        print("Connection timed out.")
        return False
    html = site.read()
    site.close()
    html_soup = soup(html, 'html.parser')
    return html_soup
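
# Usage sketch (needs network access): a BeautifulSoup object on success,
# None if the site does not exist, False on timeout.
#   page = make_soup('http://www.city-data.com/')
#   if page:
#       print(page.title.text)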

# Collect the state links from the city-data.com home page, starting at
# start_state and continuing in page order
def get_states(url, start_state):
    html_soup = make_soup(url)
    html_soup = html_soup.findAll('div', {'id': 'home1'})[0]
    links = []
    switch = False
    for state in html_soup.findAll('a'):
        if state.text == start_state:
            switch = True
        if switch:
            links.append(state['href'])
    # These two links appear among the states but are not state pages
    dc = 'http://www.city-data.com/city/District-of-Columbia.html'
    st = 'http://www.city-data.com/smallTowns.html'
    if dc in links:
        links.remove(dc)
    if st in links:
        links.remove(st)
    return links
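
# Usage sketch ('Alabama' is an assumed starting state; the link format
# follows the DC link above):
#   links = get_states('http://www.city-data.com/', 'Alabama')
#   # e.g. ['http://www.city-data.com/city/Alabama.html', ...]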

# Return the city names listed in a state page's city table
def get_cities(url):
    state_soup = make_soup(url)
    state_soup = state_soup.find('table', {'id': 'cityTAB'})
    state_soup = state_soup.find('tbody')
    city_names = []
    # Raw string so \b is a word boundary (in a plain string it is a
    # backspace character); strips an optional trailing ", XX" abbreviation
    just_city = re.compile(r'(?P<city>(\w\s?)*)(,\s\w\w\b)?')
    for row in state_soup.findAll('tr'):
        city = row.findAll('td')[1].text
        match = just_city.match(city)
        city_names.append(match.group('city'))
    return city_names
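
# Usage sketch (the state URL pattern is assumed from get_states output):
#   cities = get_cities('http://www.city-data.com/city/Alabama.html')
#   # -> a list of city-name strings from the page's city table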

# Scrape every <section> of a city page into a dict keyed by section id;
# pass through make_soup's failure values ({} for a missing site,
# False for a timeout)
def scrape_city(url):
    new_soup = make_soup(url)
    if new_soup is None:    # site does not exist
        return {}
    if new_soup is False:   # request timed out
        return False
    new_soup = new_soup.find('div', {'id': 'content'})
    sections = new_soup.findAll('section')
    content = {}
    for section in sections:
        content[section['id']] = [section.text]
    return content
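
# Usage sketch (the city URL is a hypothetical example; keys are the
# section ids found on the page):
#   content = scrape_city('http://www.city-data.com/city/Montgomery-Alabama.html')
#   if content:
#       print(list(content.keys()))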

# Wait out a server block: sleep guess hours, then ping the host; binary-
# search between mini and maxi (hours) until the host responds again
def repent(mini, maxi, guess, url):
    pings = []
    while True:
        a = datetime.now() + timedelta(hours=guess)
        print(f"Repenting until {a.hour}:{a.minute:02d}")
        sleep(60 * 60 * guess)
        forgiven = ping(url)  # note: ping expects a bare hostname
        pings.append(datetime.now())
        if not forgiven:
            # Still blocked: the wait was too short, so raise the floor
            mini = guess
            guess = (mini + maxi) / 2
        else:
            # Unblocked: this wait sufficed, so it becomes the new ceiling
            maxi = guess
            break
    return mini, maxi, pings
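
# Usage sketch: search between 0 and 8 hours, first guess 1 hour, until
# www.city-data.com (an assumed host) answers pings again.
#   mini, maxi, pings = repent(0, 8, 1, 'www.city-data.com')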