demo.py
import aiohttp
import asyncio
import re
import time
# This code demonstrates the following:
# async def
# async with
# async for
# async generators
# There are better ways to organize this code!
# For instance, task.add_done_callback() would allow me to schedule
# the regex directly after the HTTP get finishes. This would save a lot of
# lines and probably be a bit faster. The point of the demo is to show the
# speed gains and provide an example of the task/event loop model, so I
# elected not to dive too deep.
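# As a rough illustration of that alternative (not used by this demo), a
# done-callback could kick off the regex pass as soon as each page task
# finishes. The callback must be a plain function, so it schedules a new task
# rather than awaiting anything itself; get_chapters_from_text is a
# hypothetical helper that would take the page HTML directly:
#
#     def schedule_regex(task):
#         asyncio.ensure_future(get_chapters_from_text(task.result()))
#
#     page_task.add_done_callback(schedule_regex)
#
# The demo below keeps the explicit task pipeline instead.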
pages = [
    "https://www.wuxiaworld.com/novel/against-the-gods",
    "https://www.wuxiaworld.com/novel/demon-hunter",
    "https://www.wuxiaworld.com/novel/martial-god-asura",
    "https://www.wuxiaworld.com/novel/monarch-of-evernight",
]
# Wraps a plain (synchronous) iterator in an async generator.
# This is for example only and is awful, don't do this.
async def iter_to_aiter(i):
    while True:
        try:
            yield next(i)
        except StopIteration:
            break
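# In real code you would just loop over re.finditer directly with a plain
# "for"; the wrapper above only exists so the demo has something to feed
# "async for". A minimal sketch of the same idea, without the manual
# StopIteration handling, assuming the input is any finite iterable:
#
#     async def iter_to_aiter(iterable):
#         for item in iterable:
#             yield item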
# Coroutine that gets a webpage. Note the use of "async with", which awaits
# the request context manager returned by session.get and releases the
# connection when the block exits.
async def get_webpage(session, page):
    async with session.get(page) as response:
        print("Getting " + str(response.url))
        # Await the response body before returning.
        return await response.text()
# Coroutine that handles our regular expression search.
async def get_chapters(task):
    result = []
    expr = r'<span>((\w|[\-\*\.\?\:]|\s)*)</span>'
    # Suspend here until the page-scraping task finishes.
    await task
    print("Scrape completed, beginning RegEx...")
    async for match in iter_to_aiter(re.finditer(expr, task.result())):
        result.append(match.group(1))
    return result
# This is the main coroutine. It queues up all of the page-scraping tasks,
# awaits their results through the get_webpage > get_chapters pipeline set up
# above, then prints a small sample of each result to show it's working.
async def wuxia_scrape_chapters(pagelist):
    async with aiohttp.ClientSession() as session:
        pagetasks = []
        for page in pagelist:
            pagetasks.append(
                asyncio.ensure_future(get_webpage(session, page))
            )
        print("Pages are being scraped...")
        # Make more tasks to handle the results of the tasks above!
        # These run concurrently while we're waiting for the pages
        # to be scraped.
        chapters = []
        for task in pagetasks:
            chapters.append(
                asyncio.ensure_future(get_chapters(task))
            )
        print("Chapter routine initialized...")
        for ch in chapters:
            await ch
            print("Next Book: ")
            print(ch.result()[0:2])  # prints the first two chapters
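# The explicit "await ch" loop above collects the chapter tasks in list
# order. A sketch of an equivalent approach (assuming every task succeeds)
# is asyncio.gather, which awaits them all and returns their results at once:
#
#     results = await asyncio.gather(*chapters)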
# Enter the event loop
loop = asyncio.get_event_loop()
print("Loop created...")
# Time the whole scrape.
t = time.perf_counter()
loop.run_until_complete(wuxia_scrape_chapters(pages))
print(time.perf_counter() - t)
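# On Python 3.7+ the same entry point can be written without touching the
# loop object directly (a sketch, not what this demo uses):
#
#     t = time.perf_counter()
#     asyncio.run(wuxia_scrape_chapters(pages))
#     print(time.perf_counter() - t)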