# -*- coding: utf-8 -*-
import logging
import re
from concurrent import futures

import js2py
from bs4 import BeautifulSoup

from lncrawl.core.crawler import Crawler

logger = logging.getLogger(__name__)

login_url = "https://lnmtl.com/auth/login"
logout_url = "https://lnmtl.com/auth/logout"


class LNMTLCrawler(Crawler):
    has_mtl = True
    base_url = "https://lnmtl.com/"
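
    # LNMTL needs an account for full access. Logging in means fetching the
    # login form first to pick up the anti-CSRF token the site embeds as a
    # hidden "_token" input, then posting it back along with the credentials.
    # Success is detected by the presence of a logout link in the response.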
    def login(self, email, password):
        """login to LNMTL"""
        # Get the login page
        logger.info("Visiting %s", login_url)
        soup = self.get_soup(login_url)
        token = soup.select_one('form input[name="_token"]')["value"]

        # Send post request to login
        logger.info("Logging in...")
        response = self.submit_form(
            login_url,
            data=dict(
                _token=token,
                email=email,
                password=password,
            ),
        )

        # Check if logged in successfully
        soup = BeautifulSoup(response.content, "lxml")
        if soup.select_one('a[href="%s"]' % logout_url):
            print("Logged in")
        else:
            body = soup.select_one("body").text
            logger.debug("-" * 80)
            logger.debug(
                "\n\n".join([x for x in body.split("\n\n") if len(x.strip()) > 0])
            )
            logger.debug("-" * 80)
            logger.error("Failed to login")

    def logout(self):
        """logout as a good citizen"""
        logger.debug("Logging out...")
        soup = self.get_soup(logout_url)
        if soup.select_one('a[href="%s"]' % logout_url):
            logger.error("Failed to logout")
        else:
            print("Logged out")

    def read_novel_info(self):
        """get novel title, cover, and the list of chapters"""
        logger.info("Visiting %s", self.novel_url)
        soup = self.get_soup(self.novel_url)

        title = soup.select_one(".novel .media .novel-name").text
        self.novel_title = title.rsplit(" ", 1)[0]
        logger.debug("Novel title = %s", self.novel_title)

        try:
            self.novel_cover = self.absolute_url(
                soup.find("img", {"title": self.novel_title})["src"]
            )
        except Exception:
            pass  # novel cover is not so important to raise errors
        logger.info("Novel cover = %s", self.novel_cover)

        self.parse_volume_list(soup)
        self.volumes = sorted(self.volumes, key=lambda x: x["id"])

        logger.info("Getting chapters...")
        self.download_chapter_list()
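
    # The volume list is not in the page markup itself; the site assigns it to
    # a global `window.lnmtl` object in an inline <script> that follows the
    # <main> element. Instead of regex-scraping that script, it is evaluated
    # with js2py and the resulting object is read back as a Python dict.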
    def parse_volume_list(self, soup):
        self.volumes = []
        script = soup.find(name="main").find_next_sibling(name="script").string
        try:
            data = js2py.eval_js(
                "(function() {" + script + "return window.lnmtl;})()"
            ).to_dict()
            for i, vol in enumerate(data["volumes"]):
                title = vol.get("title", "") or ""
                title = re.sub(r"[^\u0000-\u00FF]", "", title)  # drop non-Latin-1 chars
                title = re.sub(r"\(\)", "", title).strip()  # drop emptied parentheses
                self.volumes.append(
                    {
                        "id": i + 1,
                        "title": title,
                        "download_id": vol["id"],
                    }
                )
        except Exception:
            logger.exception("Failed parsing the volume list script")

        if len(self.volumes) == 0:
            raise Exception("Failed parsing volume list")
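
    # Each volume's chapter pages can be fetched independently, so one task
    # per volume is submitted to the crawler's executor and the results are
    # gathered as they complete. The final pass iterates `self.volumes` in
    # order, so chapter ids stay sequential regardless of completion order.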
    def download_chapter_list(self):
        futures_to_wait = [
            self.executor.submit(self.download_chapters_per_volume, volume)
            for volume in self.volumes
        ]
        possible_chapters = {}
        for future in futures.as_completed(futures_to_wait):
            vol_id, chapters = future.result()
            possible_chapters[vol_id] = chapters

        for volume in self.volumes:
            for chapter in possible_chapters[volume["id"]]:
                chap = chapter.copy()
                chap["id"] = len(self.chapters) + 1
                chap["volume"] = volume["id"]
                self.chapters.append(chap)
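
    # Chapters come from a paginated JSON endpoint. The initial call (page=1)
    # learns `last_page` from the response, fetches the remaining pages by
    # calling itself, and returns a (volume id, chapters) pair; recursive
    # calls for page > 1 return only their own chapter list.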
    def download_chapters_per_volume(self, volume, page=1):
        url = self.absolute_url(
            f"/chapter?page={page}&volumeId={volume['download_id']}"
        )
        logger.info("Getting json: %s", url)
        result = self.get_json(url)

        chapters = []
        for chapter in result["data"]:
            title = chapter.get("title") or ""
            if chapter.get("number"):
                title = f"#{chapter.get('number')} {title}"
            chapters.append(
                {
                    "title": title,
                    "url": chapter["site_url"],
                }
            )

        if page != 1:
            return chapters

        for page in range(2, result["last_page"] + 1):
            chapters += self.download_chapters_per_volume(volume, page)
        return volume["id"], chapters
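
    # Each `.translated` element holds one translated sentence; they are
    # cleaned individually and joined as <p> tags to form the chapter body.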
    def download_chapter_body(self, chapter):
        soup = self.get_soup(chapter["url"])
        body = soup.select(".chapter-body .translated")
        body = [self.format_text(x.text) for x in body if x]
        body = "\n".join(["<p>%s</p>" % (x) for x in body if len(x)])
        return body.strip()

    def format_text(self, text):
        """formats the text and removes bad characters"""
        text = text.replace("\u00ad", "")  # drop soft hyphens
        text = re.sub(r"\u201e[, ]*", "“", text)  # low double quote -> left quote
        text = re.sub(r"\u201d[, ]*", "”", text)  # strip commas/spaces after right quote
        text = re.sub(r"[ ]*,[ ]+", ", ", text)  # normalize comma spacing
        return text.strip()
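

# A minimal usage sketch (hypothetical; normally the lncrawl app constructs
# and drives the crawler, and the exact bootstrap sequence depends on lncrawl
# internals -- names below that are not defined in this file are assumptions):
#
#   crawler = LNMTLCrawler()
#   crawler.initialize()
#   crawler.novel_url = "https://lnmtl.com/novel/some-novel"  # hypothetical URL
#   crawler.login("user@example.com", "secret")
#   crawler.read_novel_info()
#   print(crawler.novel_title, "-", len(crawler.chapters), "chapters")
#   crawler.logout()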