linkcheck.py
#!/usr/bin/env python3
'''
linkcheck - scours old posts for bad domains (et al)

Copyright Ⓒ Oli Warner, 2018

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

Depends on Python 3.6 with the requests and tqdm libraries:
    sudo apt install python3-{requests,tqdm}
    python3 linkcheck.py /path/to/dump/dir
'''

import sys
from functools import partial
from multiprocessing import Pool
from pathlib import Path

import requests  # unused below, but named as a dependency in the docstring
import tqdm


def process_file(filepath, pool, post_id_chunk):
    with filepath.open() as source_file:
        # Count the lines first so tqdm can show a real progress bar
        lines = sum(1 for line in source_file)
        source_file.seek(0)
        # imap_unordered streams lines to the pool workers; iterating only
        # drives the progress bar - process_line prints its own results
        for _ in tqdm.tqdm(pool.imap_unordered(partial(process_line, post_id_chunk=post_id_chunk), source_file),
                           total=lines, desc=filepath.name):
            pass


def process_line(line, post_id_chunk):
    line = line.lower()
    if 'http' not in line:
        return
    for bad_domain in bad_domains:
        if bad_domain not in line:
            continue
        # get the post ID - this is the int in the post_id_chunk-th chunk
        raw_post_id = line.split(maxsplit=post_id_chunk + 1)[post_id_chunk]
        post_id = ''.join(filter(lambda c: c.isdigit(), raw_post_id))
        print(f'https://askubuntu.com/q/{post_id}/\t{bad_domain}')
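
# Worked example for the chunk logic above (the row text is an assumption
# based on the Stack Exchange data-dump format, not taken from this repo):
#   line = '<row id="42" posttypeid="1" body="...">'   (already lowercased)
#   line.split(maxsplit=2) -> ['<row', 'id="42"', 'posttypeid="1" body="...">']
#   with post_id_chunk=1 that picks 'id="42"', and keeping only the digits
#   leaves post_id = '42'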


if __name__ == "__main__":
    try:
        dumpdir = Path(sys.argv[1])
        if not (dumpdir.exists() and dumpdir.is_dir() and (dumpdir / 'Posts.xml').exists()):
            raise IndexError()
    except IndexError:
        sys.stderr.write('Missing path to dump dir!\n')
        sys.exit(1)

    # Read in the blacklisted_websites
    # This is modified from the Smoke Detector
    with (Path(__file__).parent / 'smokey/blacklisted_websites.txt').open() as f:
        bad_domains = f.read().lower().splitlines()
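    # Each line of the blacklist file becomes one entry, matched as a plain
    # lowercase substring of each row. Hypothetical example entries:
    #   spam-domain.example
    #   another-bad-site.example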

    # fork()ed pool workers inherit the bad_domains global; the post ID
    # sits in chunk 1 of Posts.xml rows and chunk 2 of Comments.xml rows
    pool = Pool()
    process_file(dumpdir / 'Posts.xml', pool, post_id_chunk=1)
    process_file(dumpdir / 'Comments.xml', pool, post_id_chunk=2)
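
# A hypothetical output line (tab-separated URL and matched domain):
#   https://askubuntu.com/q/123456/    spam-domain.example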