-
-
Notifications
You must be signed in to change notification settings - Fork 300
/
nyxtranslation.py
140 lines (121 loc) · 5.47 KB
/
nyxtranslation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# -*- coding: utf-8 -*-
import logging
import re
from bs4.element import Tag
from lncrawl.core.crawler import Crawler
from lncrawl.models import Volume, Chapter
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Lower-case keywords whose presence (as a substring) marks a link-less
# table-of-contents entry as a volume heading; see NYXTranslation.is_volume.
volume_aliases = {
    "volume",
    "arc",
    "series",
    "saga",
    "chronicle",
    "tome",
    "storyline",
}
class NYXTranslation(Crawler):
    """Crawler for the novel pages of the sites listed in ``base_url``.

    A novel page is expected to hold everything inside ``main#main > article``:
    the title heading, optional "Genre"/"Author" labels, a cover image, a
    "Description" paragraph followed by the synopsis, and a
    "Table of Content(s)" paragraph followed by volume headings and
    chapter links.
    """

    base_url = ["https://nyx-translation.com/", "https://nyxtranslation.home.blog/"]
    has_manga = False
    has_mtl = False

    def initialize(self):
        """Tell the content cleaner to drop ``<script>`` and ``<a>`` tags."""
        self.cleaner.bad_tags.add("script")
        self.cleaner.bad_tags.add("a")

    def read_novel_info(self):
        """Scrape title, tags, author, cover, synopsis, volumes and chapters.

        Optional parts of the page (tags, author, cover, synopsis, table of
        contents) are skipped when they cannot be located instead of
        aborting the whole crawl.

        Raises:
            AssertionError: if the article container or its title heading
                is missing from the page.
        """
        soup = self.get_soup(self.novel_url)
        content = soup.select_one("main#main > article")
        assert isinstance(content, Tag), "No article content found"

        entry_title = content.select_one("h1.entry-title")
        assert isinstance(entry_title, Tag), "No novel title found"
        self.novel_title = entry_title.text.strip()

        self._collect_tags(content)
        self._collect_author(content)
        self._collect_cover(content)
        self._collect_synopsis(content)
        self._collect_chapters(content)

        # in rare cases the volume names don't have any indicators, so we end
        # up without any volume; this "fixes" that.
        if not self.volumes:
            self.volumes.append(Volume(id=1, title="All content"))

    def _collect_tags(self, content: Tag) -> None:
        """Append the comma-separated genres found after the "Genre" label."""
        label = content.find("strong", text=re.compile(r"Genre.*:.*"))
        if not label:
            return
        genres = label.find_next_sibling(None, text=re.compile(r"\w+,"))
        if genres is None:
            logger.debug("Genre label found, but no genre list follows it")
            return
        # A string-only search normally yields a text node; accept a tag too.
        raw = genres.get_text() if isinstance(genres, Tag) else str(genres)
        for genre in raw.split(","):
            genre = genre.strip()
            if genre:
                self.novel_tags.append(genre)

    def _collect_author(self, content: Tag) -> None:
        """Set ``novel_author`` from the node following the "Author" label."""
        label = content.find("strong", text=re.compile(r"Author.*:?.*"))
        if not label:
            return
        author = label.next_sibling
        # When the node right after the label contains the ": " separator,
        # the author name is taken from the node after that one.
        if author is not None and ": " in author.text:
            author = author.next_sibling
        if author is None:
            logger.debug("Author label found, but no author name follows it")
            return
        self.novel_author = author.text.strip()

    def _collect_cover(self, content: Tag) -> None:
        """Set ``novel_cover`` from the first image inside the article."""
        cover = content.select_one("img")  # first image is the novel cover
        if not cover:
            return
        src = str(cover.get("src", ""))
        # may be replaced with JS after load, in such case try and get the
        # real img hidden in data-values (falls back to the original src)
        if src.startswith("data:"):
            src = str(cover.get("data-orig-file", src))
        if src:
            self.novel_cover = self.absolute_url(src)

    def _collect_synopsis(self, content: Tag) -> None:
        """Set ``novel_synopsis`` from the paragraphs after "Description".

        Collection stops at the first element that is not a ``<p>``, when the
        siblings run out, or when the node following the current paragraph
        contains "Alternative Name(s)".
        """
        start = content.find("p", text="Description")
        if start is None:
            logger.debug("No description paragraph found")
            return

        paragraphs = []
        node = start.next_sibling
        while node is not None:
            if isinstance(node, Tag):
                following = node.next_sibling
                # NOTE(review): the marker is looked up in the node *after*
                # the current paragraph (kept as in the original logic), so
                # that paragraph itself is not added - confirm this is intended.
                if node.name != "p" or (
                    following is not None and "Alternative Name(s)" in following
                ):
                    break
                paragraphs.append(node.text + "\n")
            node = node.next_sibling
        self.novel_synopsis = "".join(paragraphs)

    def _collect_chapters(self, content: Tag) -> None:
        """Fill ``volumes`` and ``chapters`` from the table of contents.

        Walks the siblings after the "Table of Content(s)" paragraph:
        elements without links are either volume headings (see
        ``is_volume``) or a title prefix for the following "-part-N" links;
        elements with links yield one chapter per link.
        """
        # "inconsistency is key" - the site author, probably... (s is optional)
        toc_start = content.find("p", text=re.compile(r"Table of Contents?", re.IGNORECASE))
        if toc_start is None:
            logger.warning("No table of contents found on %s", self.novel_url)
            return

        part_pattern = re.compile(r".+-part-\d+.*")
        relocated = False  # the "lazy structure" jump may happen only once
        chap = ""
        node = toc_start.next_sibling
        while node is not None:
            if not isinstance(node, Tag):
                node = node.next_sibling
                continue
            # there are some aria-hidden spacing divs within the chapter list
            # also skip text-empty elements
            if (node.name == "div" and node.has_attr("aria-hidden")) or not node.text:
                node = node.next_sibling
                continue

            links = node.find_all("a")
            if not links:
                if self.is_volume(node.text):
                    logger.info("Found a volume: %s", node.text)
                    self.volumes.append(
                        Volume(
                            id=len(self.volumes) + 1,
                            title=node.text.strip().replace(":", ""),
                        )
                    )
                elif node.name in ("div", "script", "footer"):
                    # these are all elements (except the spacer div) that
                    # shouldn't appear -> the list should be done
                    break
                else:
                    chap = node.text  # would be a chapter title
            else:
                for link in links:
                    href = link.get("href")
                    if not href:
                        continue  # anchor without a target, nothing to fetch
                    href = str(href)
                    if not self.on_site(href):
                        if relocated:
                            # Jumping again would restart the very same walk
                            # and never terminate, so just ignore the link.
                            continue
                        logger.info("Found external link, assuming lazy structure, link: %s", href)
                        # Continue scanning next to the TOC paragraph's parent;
                        # the shared advance below then steps one node further.
                        node = toc_start.parent.next_sibling
                        relocated = True
                        break  # break out of for loop in this case.
                    if not part_pattern.match(href.lower()):
                        chap = ""
                    self.chapters.append(
                        Chapter(
                            id=len(self.chapters) + 1,
                            # strip avoids a leading space when there is no prefix
                            title=f"{chap} {link.text.lower()}".strip(),
                            url=self.absolute_url(href),
                            # guarantee chapters like prologues listed outside vol1, are in vol1
                            volume=max(len(self.volumes), 1),
                        )
                    )
            # node is None when the relocation target does not exist
            node = node.next_sibling if node is not None else None

    def download_chapter_body(self, chapter):
        """Fetch the chapter page and return its cleaned ``div.entry-content``."""
        soup = self.get_soup(chapter.url)
        contents_html = soup.select_one("div.entry-content")
        return self.cleaner.extract_contents(contents_html)

    def on_site(self, href: str) -> bool:
        """Return True if ``href`` is an absolute URL under one of ``base_url``.

        Links that do not contain "http" (e.g. relative ones) count as
        off-site. The prefix comparison ignores letter case, like the
        "http" check does.
        """
        lowered = href.lower()
        if "http" not in lowered:
            return False
        return any(lowered.startswith(url.lower()) for url in self.base_url)

    @classmethod
    def is_volume(cls, text: str) -> bool:
        """Return True if ``text`` contains any of ``volume_aliases`` (case-insensitive)."""
        lowered = text.lower()
        return any(alias in lowered for alias in volume_aliases)