example_page_crawler.py
"""
Simple scraping script.
- reads site map, reads links
- follows each link t read more links
- etc. etc.
"""
from webtools import (
    PageOptions,
    WebConfig,
    DomainCache,
    Url,
    ContentLinkParser,
    run_server_task,
)

__version__ = "0.0.1"

class Crawler(object):
    # Class-level store shared by all instances: maps each discovered URL
    # to its page data ({} until the URL has been processed).
    pages = {}

    def crawl(self, url):
        # Start by crawling the link itself.
        self.add(url)

        # Crawl the main domain.
        domain_cache = DomainCache.get_object(url)
        self.add(domain_cache.url)

        # Seed the queue with the site map URLs.
        site_maps_urls = domain_cache.get_site_maps_urls()
        for site_map_url in site_maps_urls:
            self.add(site_map_url)

        # Keep processing until no unprocessed URLs remain.
        while self.process_urls():
            pass

    def add(self, url):
        if url not in Crawler.pages:
            print("Added:{}".format(url))
            Crawler.pages[url] = {}

    def process_urls(self):
        url = self.get_next_to_crawl()
        if url:
            response = self.process_url(url)
            if response and response.is_valid():
                text = response.get_text()
                if text:
                    parser = ContentLinkParser(url, text)
                    links = parser.get_links()
                    for link in links:
                        self.add(link)
            # Store the response even when it is invalid, so the URL is
            # marked as processed and not picked up again.
            Crawler.pages[url]["response"] = response
            return True
        return False

    def get_next_to_crawl(self):
        # An empty data dict means the URL has not been processed yet.
        for url in Crawler.pages:
            page_data = Crawler.pages[url]
            if len(page_data) == 0:
                return url

    def process_url(self, url):
        print("Scraping:{}".format(url))

        options = PageOptions()
        options.use_headless_browser = False
        options.use_full_browser = False

        url = Url(url=url, page_options=options)
        handler = url.get_handler()
        response = url.get_response()
        return response
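
# A hedged sketch of a browser-based fetch. The PageOptions flag names are
# taken from this file; that enabling one routes the request through the
# crawling_headless_script configured in main() is an assumption:
#
#   options = PageOptions()
#   options.use_headless_browser = True
#   response = Url(url="https://example.com", page_options=options).get_response()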
def main():
    WebConfig.init()
    WebConfig.use_print_logging()

    # More advanced processing is possible through other frameworks.
    server = run_server_task()
    WebConfig.crawling_server_port = server.port
    WebConfig.crawling_full_script = "poetry run python crawleebeautifulsoup.py"
    WebConfig.crawling_headless_script = "poetry run python crawleebeautifulsoup.py"

    print("Enter page to crawl")
    url = input("->")

    c = Crawler()
    c.crawl(url)


if __name__ == "__main__":
    main()
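
# To run (assuming a Poetry environment that provides webtools and the
# crawleebeautifulsoup.py helper script referenced above):
#
#   poetry run python example_page_crawler.py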