dead_bot.py
# -*- coding: utf-8 -*-
"""
dead_bot.py
~~~~~~~~~~~~
Simple model to find dead links in a website
:copyright: (c) 2019 by Divine Sedem Tettey
:license: Apache2, see LICENSE for more details.
"""
import json
import sys
import time
import urllib.error
from collections import deque
from html.parser import HTMLParser
from urllib import request
from urllib.parse import urljoin, urlparse
from urllib.request import Request

# tag attributes that can carry a link
search_attrs = {'href', 'src'}
# browser-style User-Agent string sent with every request
agents = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'


class DeadLinks(HTMLParser):
    def __init__(self, home):
        super().__init__()
        self.home = home
        self.checked_links = set()
        self.pages_to_check = deque()
        self.pages_to_check.appendleft(home)
        self.dead_links = {}
        self.crawler()

    def crawler(self):
        """Crawl the queued pages, then write the dead-link report."""
        # while there are still pages to crawl
        while self.pages_to_check:
            page = self.pages_to_check.pop()
            # send the request using a browser-style User-Agent header
            req = Request(page, headers={'User-Agent': agents})
            try:
                res = request.urlopen(req)
            except Exception:
                # skip pages we cannot fetch at all
                continue
            # make sure it is HTML that we are about to parse
            if 'html' in res.headers.get('Content-Type', ''):
                with res as f:
                    # read the body, ignoring bytes that are not valid UTF-8
                    body = f.read().decode('utf-8', errors='ignore')
                self.feed(body)
        # use the current time to name the report file
        file_name = time.strftime("%Y%m%d%H%M%S") + ".json"
        # write the results to a JSON file
        self.write_result_to_json(file_name, self.dead_links)

    def handle_starttag(self, tag, attrs):
        """Collect link-bearing attributes from each start tag."""
        for name, value in attrs:
            # skip valueless attributes (e.g. a bare `href`) and links
            # we have already checked
            if name in search_attrs and value and value not in self.checked_links:
                self.checked_links.add(value)
                self.handle_link(value)

    def handle_link(self, link):
        """Check a single link and record it if it is dead."""
        # a link with no netloc is relative; we can't send a request to it
        # as-is, so resolve it against the home page first
        if not urlparse(link).netloc:
            link = urljoin(self.home, link)
        # now attempt to send a request, watching out for the HTTP status code
        try:
            req = Request(link, headers={'User-Agent': agents})
            status = request.urlopen(req).getcode()
        except urllib.error.HTTPError as err:
            self.dead_links[link] = err.code
            print(f"HTTPError: {err.code} - {link}")
        except urllib.error.URLError as err:
            self.dead_links[link] = str(err.reason)
            print(f"URLError: {err.reason} - {link}")
        except Exception:
            print("Unexpected Error:", sys.exc_info()[0])
        else:
            print(f'{status} - {link}')
            # only queue links that stay within the site being crawled
            if self.home in link:
                self.pages_to_check.appendleft(link)

    def write_result_to_json(self, file_name, result):
        """Dump the collected dead links to a JSON file."""
        print(f"creating file: {file_name}...")
        print("writing to file...")
        with open(file_name, 'w') as f:
            json.dump(result, f, indent=2)
        print('...write to file complete')


if __name__ == '__main__':
    if len(sys.argv) != 2:
        sys.exit("usage: python dead_bot.py <home-url>")
    DeadLinks(sys.argv[1])
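
# Reading a report back (a minimal sketch; the file name is illustrative,
# since the script timestamps its output):
#
#   import json
#   with open("20190101120000.json") as f:
#       report = json.load(f)   # e.g. {"https://example.com/old": 404}
#   for link, code in report.items():
#       print(code, link)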