# -*- coding: utf-8 -*-
import logging
import urllib.parse

from bs4 import Tag

from lncrawl.core.crawler import Crawler
from lncrawl.models import Volume, Chapter

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:101.0) Gecko/20100101 Firefox/101.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,"
    "application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9,de-CH;q=0.8,de;q=0.7",
    "Cache-Control": "no-cache",
    "Content-Type": "application/x-www-form-urlencoded",
    "Origin": "https://www.69shu.pro",
    "DNT": "1",
    "Referer": "https://www.69shu.pro/modules/article/search.php",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Opera GX";v="106"',
    "Sec-Ch-Ua-Platform": "Windows",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
}

logger = logging.getLogger(__name__)
search_url = "https://www.69shu.pro/modules/article/search.php" # Updated to the new domain


class sixnineshu(Crawler):
    base_url = [
        "https://www.69shuba.com/",
        "https://www.69shu.com/",
        "https://www.69xinshu.com/",
        "https://www.69shu.pro/",
        "https://www.69shuba.pro/",
    ]

    def initialize(self):
        # The default lxml parser cannot handle this site's huge GBK-encoded
        # pages (it fails after ~4.3k chapters), so use html.parser instead.
        self.init_parser("html.parser")
        self.init_executor(ratelimit=20)

    def search_novel(self, query):
        # The site's search form expects the query percent-encoded as GBK
        # bytes rather than UTF-8.
        query = urllib.parse.quote(query.encode("gbk"))
        data = f"searchkey={query}&submit=Search"
        soup = self.post_soup(
            search_url,
            headers=headers,
            data=data,
            encoding="gbk",
        )

        results = []
        for novel in soup.select("div.newbox ul li"):
            results.append(
                {
                    "title": novel.select_one("h3 a:not([imgbox])").text.title(),
                    "url": self.absolute_url(novel.select_one("a")["href"]),
                    "info": "Latest: %s" % novel.select_one("div.zxzj p").text,
                }
            )
        return results
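
    # Illustration only (not part of the crawler's flow): the GBK encoding
    # above changes the percent-escapes the server receives. For example,
    # for the query "测试":
    #
    #     urllib.parse.quote("测试".encode("gbk"))  # -> '%B2%E2%CA%D4'
    #     urllib.parse.quote("测试")                # -> '%E6%B5%8B%E8%AF%95' (UTF-8)
    #
    # The endpoint decodes GBK, so the UTF-8 escapes would be misinterpreted.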

    def read_novel_info(self):
        logger.debug("Visiting %s", self.novel_url)
        soup = self.get_soup(self.novel_url, encoding="gbk")

        possible_title = soup.select_one("div.booknav2 h1")
        assert possible_title, "No novel title"
        self.novel_title = possible_title.text.strip()
        logger.info("Novel title: %s", self.novel_title)

        possible_image = soup.select_one("div.bookimg2 img")
        if isinstance(possible_image, Tag):
            self.novel_cover = self.absolute_url(possible_image["src"])
        logger.info("Novel cover: %s", self.novel_cover)

        possible_author = soup.select_one('.booknav2 p a[href*="authorarticle"]')
        if isinstance(possible_author, Tag):
            self.novel_author = possible_author.text.strip()
        logger.info("Novel author: %s", self.novel_author)

        possible_synopsis = soup.select_one("div.navtxt p")
        if isinstance(possible_synopsis, Tag):
            self.novel_synopsis = possible_synopsis.text.strip()
        logger.info("Novel synopsis: %s", self.novel_synopsis)

        # Only one category per novel on this website.
        possible_tag = soup.select_one('.booknav2 p a[href*="top"]')
        if isinstance(possible_tag, Tag):
            self.novel_tags = [possible_tag.text.strip()]
        logger.info("Novel tag: %s", self.novel_tags)

        # The chapter list lives on a separate page, e.g.
        # https://www.69shuba.com/txt/A43616.htm -> https://www.69shuba.com/A43616/
        soup = self.get_soup(
            self.novel_url.replace("/txt/", "/").replace(".htm", "/"),
            encoding="gbk",
        )

        # Manually correct the site's chapter identifiers when they are off:
        # shift every data-num by a constant so the first chapter becomes 1
        # (see the worked example below).
        correction = 0
        for idx, li in enumerate(soup.select("div#catalog ul li")):
            chap_id = int(li["data-num"])
            if idx == 0:
                correction = 1 - chap_id
            chap_id += correction

            # Group chapters into synthetic volumes of 100 chapters each.
            vol_id = len(self.chapters) // 100 + 1
            if len(self.chapters) % 100 == 0:
                self.volumes.append(Volume(vol_id))

            a = li.select_one("a")
            if not a:
                # This should not occur with html.parser; if it does, it is
                # likely a parser or encoding issue.
                logger.warning("Failed to get chapter %d: missing link", chap_id)
                continue

            self.chapters.append(
                Chapter(
                    chap_id,
                    url=self.absolute_url(a["href"]),
                    title=li.text.strip(),
                    volume=vol_id,
                )
            )
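
    # Worked example for the correction above, with illustrative values: if
    # the catalog's first <li> carries data-num="4", then
    # correction = 1 - 4 = -3, and raw ids 4, 5, 6, ... become 1, 2, 3, ...
    # A first id of 1 gives correction = 0, i.e. no shift.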

    def download_chapter_body(self, chapter):
        soup = self.get_soup(chapter.url, encoding="gbk")
        contents = soup.select_one("div.txtnav")
        assert contents, "No chapter contents"
        # Strip the in-page title, chapter metadata, and the side/ad block
        # before handing the remainder to the cleaner.
        for selector in ("h1", "div.txtinfo", "div#txtright"):
            tag = contents.select_one(selector)
            if isinstance(tag, Tag):
                tag.decompose()
        return self.cleaner.extract_contents(contents)
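

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the crawler. lncrawl normally drives this
# class through its app; calling it directly as below is an assumption about
# the Crawler base class, shown only to illustrate the call order
# (initialize -> read_novel_info -> download_chapter_body). The URL is the
# example from the TOC comment above.
if __name__ == "__main__":
    crawler = sixnineshu()
    crawler.initialize()
    crawler.novel_url = "https://www.69shuba.com/txt/A43616.htm"
    crawler.read_novel_info()
    print(crawler.novel_title, "-", len(crawler.chapters), "chapters")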