uukanshu_sj.py
# -*- coding: utf-8 -*-
import logging
import re

from lncrawl.core.crawler import Crawler

logger = logging.getLogger(__name__)

novel_search_url = "%ssearch.aspx?k=%s"
chapter_list_url = "%s&page=%d"


class UukanshuOnlineSJ(Crawler):
    # previously .com; the old .com domain now redirects to .net
    base_url = ["https://sj.uukanshu.net/"]

    def search_novel(self, query):
        query = query.lower().replace(" ", "+")
        soup = self.get_soup(novel_search_url % (self.home_url, query))

        results = []
        for data in soup.select("#bookList li"):
            title = data.select_one(".book-title a.name")["title"]
            author = data.select_one(".book-title .aut").get_text()
            url = self.home_url + data.select_one(".book-title a.name")["href"]
            results.append(
                {
                    "title": title,
                    "url": url,
                    "info": f"Author: {author}",
                }
            )
        return results

    def read_novel_info(self):
        soup = self.get_soup(self.novel_url)

        self.novel_title = soup.select_one(".bookname").text.strip()
        logger.info("Novel title: %s", self.novel_title)

        possible_image = soup.select_one(".book-info img")
        if possible_image:
            self.novel_cover = self.absolute_url(possible_image["src"])
        logger.info("Novel cover: %s", self.novel_cover)

        # Strip the "作者:" ("Author:") label from the author field.
        self.novel_author = (
            soup.select_one(".book-info dd").text.replace("作者:", "").strip()
        )
        logger.info("Novel author: %s", self.novel_author)

        logger.info("Getting chapters...")
        soup = self.get_soup(chapter_list_url % (self.novel_url, 1))

        # The chapter list is paginated; read the total page count from the
        # last pager link (e.g. "...&page=12").
        try:
            last_page = soup.select_one(".pages a:last-child")
            page_count = int(re.findall(r"&page=(\d+)", str(last_page["href"]))[0])
        except Exception as err:
            logger.debug("Failed to parse page count. Error: %s", err)
            page_count = 0
        logger.info("Total pages: %d", page_count)

        # Page 1 is already loaded; fetch the remaining pages concurrently.
        futures = [
            self.executor.submit(self.get_soup, chapter_list_url % (self.novel_url, p))
            for p in range(2, page_count + 1)
        ]
        page_soups = [soup] + [f.result() for f in futures]

        for soup in page_soups:
            for a in soup.select("ul#chapterList li a"):
                chap_id = len(self.chapters) + 1
                # Group chapters into synthetic volumes of 100 chapters each.
                vol_id = 1 + len(self.chapters) // 100
                if chap_id % 100 == 1:
                    self.volumes.append({"id": vol_id})
                self.chapters.append(
                    {
                        "id": chap_id,
                        "volume": vol_id,
                        "title": a.text,
                        "url": self.home_url + a["href"],
                    }
                )

    def download_chapter_body(self, chapter):
        soup = self.get_soup(chapter["url"])
        body = soup.select_one("#bookContent")
        content = self.cleaner.extract_contents(body)
        return self.format_text(content)

    @staticmethod
    def format_text(text):
        # Strip the "UU看书 www.uukanshu.com" site watermark in any letter case.
        text = re.sub(
            r"[Uu][Uu]\s*看书\s*[Ww][Ww][Ww]\.[Uu][Uu][Kk][Aa][Nn][Ss][Hh][Uu]\.[Cc][Oo][Mm]",
            "",
            text,
        )
        # Drop boilerplate notices injected by the site:
        # "chapter missing / error report" link
        text = text.replace("章节缺失、错误举报", "")
        # "note: if this chapter shows anti-piracy or wrong content, or the book
        # has stopped updating, please log in →→"
        text = text.replace("注:如你看到本章节内容是防盗错误内容、本书断更等问题请登录后→→", "")
        # "latest URL:" banner
        text = text.replace("最新网址:", "")
        # "please remember this book's original domain ... fastest mobile URL:"
        text = text.replace("请记住本书首发域名:。手机版更新最快网址:", "")
        text = text.replace("www.uukanshu.com", "")
        return text
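

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module).
# format_text is a plain @staticmethod, so it can be tried without the rest of
# the lncrawl machinery; the sample string below is hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = "第一章 正文内容 UU看书 www.uukanshu.com 最新网址:"
    # Prints the sample with the watermark and banner text stripped.
    print(UukanshuOnlineSJ.format_text(sample))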