-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_unique_hostnames.py
executable file
·41 lines (33 loc) · 1.02 KB
/
get_unique_hostnames.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python3
"""
Scan an arbitrary block of text for http/https URLs and extract, sort, and
deduplicate the hostnames. If big_list.csv or ignore.txt exist, any
hostnames in those will be removed from the final output.
"""
import csv
import re
import sys
# Captures the host part that follows an http:// or https:// scheme.
hostname_regex = re.compile(r"https?://([\w\.-]+)")


def normalize_hostname(hostname):
    """Return *hostname* in canonical form for comparison.

    Surrounding whitespace is removed, the name is lowercased, and trailing
    dots are dropped so that "Example.com." and "example.com" compare equal.
    """
    return hostname.strip().lower().rstrip(".")


def extract_hostnames(text):
    """Return the set of normalized hostnames of all http/https URLs in *text*."""
    hostnames = set()
    for match in hostname_regex.findall(text):
        hostname = normalize_hostname(match)
        # The pattern also accepts runs made only of dots (e.g. "http://..."),
        # which normalize to an empty string; those are not hostnames.
        if hostname:
            hostnames.add(hostname)
    return hostnames


def csv_hostnames(lines):
    """Return the normalized values of the "hostname" column of a CSV source.

    *lines* is any iterable of CSV text lines (such as an open file) whose
    first row is the header. Raises KeyError if a data row is read and the
    header has no "hostname" column.
    """
    excluded = set()
    for row in csv.DictReader(lines):
        value = row["hostname"]
        # DictReader fills missing trailing fields of a short row with None.
        if value:
            excluded.add(normalize_hostname(value))
    return excluded


def ignored_hostnames(lines):
    """Return the normalized hostnames listed in ignore-file *lines*.

    Each line holds at most one hostname; everything from the first "#" to the
    end of the line is a comment. Blank and comment-only lines are skipped.
    """
    ignored = set()
    for line in lines:
        hostname = normalize_hostname(line.split("#", maxsplit=1)[0])
        if hostname:
            ignored.add(hostname)
    return ignored


def load_exclusions(path, parser, newline=None):
    """Parse the file at *path* with *parser* and return the resulting set.

    *parser* receives the open file and must return a set. The exclusion
    files are optional, so a file that cannot be opened or read yields an
    empty set instead of an error.
    """
    try:
        with open(path, newline=newline) as handle:
            return parser(handle)
    except OSError:
        return set()


def main():
    """Read text from stdin and print its unique, sorted hostnames to stdout."""
    print("Paste text block to be converted, then press Ctrl+D:", file=sys.stderr)
    hostnames = extract_hostnames(sys.stdin.read())
    # The csv module expects files opened with newline="" so it can handle
    # line endings (including ones embedded in quoted fields) itself.
    hostnames -= load_exclusions("big_list.csv", csv_hostnames, newline="")
    hostnames -= load_exclusions("ignore.txt", ignored_hostnames)
    for hostname in sorted(hostnames):
        print(hostname)


if __name__ == "__main__":
    main()