-
Notifications
You must be signed in to change notification settings - Fork 0
/
MultiThread.py
82 lines (72 loc) · 2.47 KB
/
MultiThread.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
1242. Web Crawler Multithreaded
Producer-consumer pattern
# """
# This is HtmlParser's API interface.
# You should not implement it, or speculate about its implementation
# """
# class HtmlParser(object):
# def getUrls(self, url):
# """
# :type url: str
# :rtype List[str]
# """
import queue
import threading
from typing import List
from urllib.parse import urlsplit
class Solution:
    """Producer/consumer multithreaded crawler (LeetCode 1242).

    A fixed pool of 5 daemon worker threads pulls URLs from a request
    queue, asks ``htmlParser`` for each page's links, and pushes the
    same-domain subset onto a result queue.  The main thread consumes
    results, dedupes, schedules new work, and tracks how many fetches
    are still outstanding so it knows when the crawl is complete.
    """

    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        """Return every URL reachable from *startUrl* within its domain.

        :param startUrl: URL the crawl begins at.
        :param htmlParser: object exposing ``getUrls(url) -> List[str]``.
        :returns: list of all visited same-domain URLs (order unspecified).
        """
        domain = urlsplit(startUrl).netloc
        requestQueue = queue.Queue()   # URLs awaiting a fetch
        resultQueue = queue.Queue()    # per-URL lists of same-domain links
        requestQueue.put(startUrl)
        for _ in range(5):
            worker = threading.Thread(
                target=self._crawl,
                args=(domain, htmlParser, requestQueue, resultQueue))
            # Daemon: workers block forever on requestQueue.get(); daemonizing
            # lets the process exit once the main thread is done.
            worker.daemon = True
            worker.start()
        running = 1  # number of fetches whose results we have not consumed yet
        visited = {startUrl}
        while running > 0:
            # Blocks until some worker finishes a fetch and reports its links.
            for url in resultQueue.get():
                if url not in visited:
                    visited.add(url)
                    requestQueue.put(url)
                    running += 1
            running -= 1  # the fetch whose result we just consumed is done
        return list(visited)

    def _crawl(self, domain, htmlParser, requestQueue, resultQueue):
        """Worker loop: fetch each queued URL, report its same-domain links."""
        while True:
            url = requestQueue.get()
            sameDomain = [u for u in htmlParser.getUrls(url)
                          if urlsplit(u).netloc == domain]
            resultQueue.put(sameDomain)
class Solution:
    """Thread-per-URL crawler (LeetCode 1242), alternative approach.

    Spawns one thread per newly discovered same-domain URL.  A lock
    guards the ``visited`` set; a deque of live threads lets the main
    thread join them all before returning.
    """

    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        """Return every URL reachable from *startUrl* within its hostname.

        :param startUrl: URL the crawl begins at.
        :param htmlParser: object exposing ``getUrls(url) -> List[str]``.
        :returns: list of all visited same-hostname URLs (order unspecified).
        """
        # Bug fix: ``deque`` was used without ever being imported.
        from collections import deque
        from threading import Lock, Thread

        def get_hostname(url: str) -> str:
            # Drop the scheme ("http://"), keep everything before the first '/'.
            return url.split('//', 1)[1].split('/', 1)[0]

        def fetch(url: str) -> None:
            for link in htmlParser.getUrls(url):
                if get_hostname(link) != hostname:
                    continue
                with lock:
                    if link in visited:
                        continue
                    visited.add(link)
                    # Spawn and register under the lock so the main thread's
                    # drain loop below always sees the child before its
                    # parent thread terminates.
                    worker = Thread(target=fetch, args=(link,))
                    worker.start()
                    pending.append(worker)

        hostname = get_hostname(startUrl)
        lock = Lock()
        visited = {startUrl}
        # Bug fix: the deque must exist BEFORE the root thread starts,
        # otherwise a fast fetch() hits an unbound local when appending.
        pending = deque()
        root = Thread(target=fetch, args=(startUrl,))
        pending.append(root)
        root.start()
        # A joined thread registers any children before it exits, so new
        # work is always visible here before the deque can drain empty.
        while pending:
            pending.popleft().join()
        return list(visited)