-
Notifications
You must be signed in to change notification settings - Fork 0
/
linear.py
42 lines (31 loc) · 1 KB
/
linear.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import re
import requests
import time
# Novel index pages on wuxiaworld.com to scrape; this list is handed to
# wuxia_scrape_chapters() by the script at the bottom of the file.
pages = [
    "https://www.wuxiaworld.com/novel/against-the-gods",
    "https://www.wuxiaworld.com/novel/demon-hunter",
    "https://www.wuxiaworld.com/novel/martial-god-asura",
    "https://www.wuxiaworld.com/novel/monarch-of-evernight",
]
def get_webpage(page, *, timeout=10.0):
    """Download *page* with an HTTP GET and return the response body as text.

    Args:
        page: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        The decoded body of the response (``response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    response = requests.get(page, timeout=timeout)
    # Printed after the request so the URL shown is the final one, i.e. it
    # reflects any redirects that were followed.
    print("Getting " + str(response.url))
    # NOTE: the HTTP status code is not checked; an error page's body is
    # returned like any other page.
    return response.text
# Matches a <span> whose entire content is made of word characters,
# whitespace and the punctuation - * . ? :  (group 1 is that content).
# A single character class replaces the original per-character alternation
# group; it accepts exactly the same strings with one capture instead of two.
_SPAN_TEXT_RE = re.compile(r'<span>([\w\s\-*.?:]*)</span>')


def get_chapters(pagetext):
    """Return the text of every simple ``<span>`` element in *pagetext*.

    A span is only picked up when everything between ``<span>`` and
    ``</span>`` consists of word characters, whitespace, or one of
    ``- * . ? :``. Spans with attributes, nested tags, or other punctuation
    (for example an apostrophe) are skipped. On the pages this script
    scrapes these spans are presumably the chapter titles.

    Args:
        pagetext: HTML source of a page, as a string.

    Returns:
        A list of the matched span contents, in document order. Empty
        spans yield an empty string.
    """
    print("Scrape completed, begining RegEx...")
    # With exactly one capture group, findall() returns the group-1 strings.
    return _SPAN_TEXT_RE.findall(pagetext)
def wuxia_scrape_chapters(pagelist):
    """Scrape every page in *pagelist* and print a short preview per page.

    All pages are downloaded and parsed first (``get_webpage`` followed by
    ``get_chapters``). Only after that is a "Next Book: " header printed
    for each page, followed by the first two entries extracted from it,
    in the same order as *pagelist*.

    Args:
        pagelist: Iterable of page URLs to scrape.

    Returns:
        None. The results are only printed.
    """
    scraped = []
    for url in pagelist:
        print("Scraping next page...")
        html = get_webpage(url)
        scraped.append(get_chapters(html))

    preview_size = 2  # how many leading entries to show per book
    for titles in scraped:
        print("Next Book: ")
        print(titles[:preview_size])
# Run the scrape over all configured pages and report how long it took.
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement and measures elapsed wall-clock time, which is what matters
# for a network-bound job like this one.
print("Scraping...")
t = time.perf_counter()
wuxia_scrape_chapters(pages)
print(time.perf_counter() - t)