godwin_finder_trio.py
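
# A toy depth-bounded Wikipedia crawler built on trio's structured
# concurrency: starting from a given article, it fetches pages concurrently
# with asks, scans each one for KEYWORDS, and cancels the whole crawl as
# soon as one page matches (the "Godwin point").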
import argparse
import sys
import urllib.parse

import asks
import trio
from bs4 import BeautifulSoup

from easy_timing import timer
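
# Third-party dependencies: trio, asks and beautifulsoup4; easy_timing is a
# local helper module, assumed here to provide a `timer` context manager.
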
# Keywords whose presence on a page counts as reaching the Godwin point.
KEYWORDS = ("Nazi", "Hitler", "Nazie")
# KEYWORDS = ("defzdeazefdezfzedezzf", "ezerzrerfez")  # never-matching test values

parser = argparse.ArgumentParser(description="Search Wikipedia for a Godwin point")
parser.add_argument("start_page", help="Wikipedia start page")
parser.add_argument(
    "-d", "--depth", help="Maximum crawl depth", type=int, default=2
)
args = parser.parse_args()
depth_max = args.depth

MAX_WORKERS = 10  # maximum number of concurrent page fetches

task_completed = 0
task_queued = 0
def print_ident(string, ident):
    """Print `string` indented according to the depth of dotted task id `ident`."""
    print(ident.count(".") * " " + string)
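
# Task ids are dotted paths such as "1.3.2": one dot per crawl level, so the
# dot count both indents the log output and bounds recursion via depth_max.
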
async def task(nursery, lock, task_id, page):
    """Fetch one Wikipedia page, scan it for KEYWORDS, and spawn child tasks.

    Arguments:
        nursery {trio.Nursery} -- nursery in which child tasks are spawned
        lock {trio.CapacityLimiter} -- caps concurrent fetches at MAX_WORKERS
        task_id {string} -- dotted id such as "1.3.2"; its depth bounds recursion
        page {string} -- URL path of the page to fetch, e.g. "/wiki/..."
    """
    global task_completed, task_queued
    async with lock:  # a CapacityLimiter, not a Lock: MAX_WORKERS tasks may hold it at once
        page_decoded = urllib.parse.unquote(page)
        print_ident(
            f"Task #{task_id:10} - Begin - Page {page_decoded}",
            task_id,
        )
        response = await asks.get("https://fr.wikipedia.org" + page)
        soup = BeautifulSoup(response.content, "html.parser")
        text = soup.get_text()
        if any(s in text for s in KEYWORDS):
            print_ident(
                f"Task #{task_id:10} - Godwin point FOUND - Page {page_decoded}", task_id
            )
            # Cancel every task in the nursery, this one included.
            nursery.cancel_scope.cancel()
        else:
            print_ident(
                f"Task #{task_id:10} - Godwin not found - Page {page_decoded:30} ",
                task_id,
            )
            if task_id.count(".") < depth_max:
                # Collect article links, skipping files, portals, categories,
                # help pages and disambiguation pages.
                links = []
                for link in soup.find(id="content").find_all("a"):
                    href = str(link.get("href"))
                    if (
                        href.startswith("/wiki/")
                        and not href.startswith(
                            ("/wiki/Fichier:", "/wiki/Portail:",
                             "/wiki/Cat%C3%A9gorie:", "/wiki/Aide:")
                        )
                        and not href.endswith(("Projet:Accueil", "homonymie)"))
                    ):
                        links.append(href)
                print_ident(
                    f"Task #{task_id:10} - {len(links)} links found - Page {page_decoded:30} ",
                    task_id,
                )
                for i, link in enumerate(links, 1):
                    nursery.start_soon(task, nursery, lock, task_id + "." + str(i), link)
                    task_queued += 1
        task_completed += 1
        print_ident(
            f"Task #{task_id:10} - End task ({task_queued - task_completed} in queue, {task_completed} completed) - Page {page_decoded}",
            task_id,
        )
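
# All tasks share one nursery, hence one cancel scope: a keyword hit cancels
# every in-flight sibling at once, and trio.move_on_after(20) below caps the
# whole crawl at 20 seconds.
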
async def main():
    global task_queued
    print("Begin parent")
    with trio.move_on_after(20):  # overall 20-second budget for the whole crawl
        async with trio.open_nursery() as nursery:
            lock = trio.CapacityLimiter(MAX_WORKERS)
            print("Begin nursery")
            await task(nursery, lock, "1", "/wiki/" + args.start_page)
            task_queued += 1
            print("Waiting for children")
    print("End parent")
if __name__ == "__main__":
    try:
        # asks.init() selects the async backend; it exists in older asks
        # releases (newer ones may pick the backend automatically).
        asks.init("trio")
        with timer(""):
            trio.run(main)
    except KeyboardInterrupt:
        print("\nBye bye.")
        sys.exit(0)
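
# Example invocation ("France" is just an illustrative start page):
#   python godwin_finder_trio.py France --depth 2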