[2chen] add '.club' support #3406
Conversation
should I also remove the ?dl= query strings?

$ gdl -g https://2chen.club/tv/1
https://2chen.club/assets/images/src/1f4ff1f76672b796fd2d2e119595228b8267847f.webm?dl=teehee.webm
https://2chen.club/assets/images/src/cdc03ac39210e3127328bb4c7afb9c3643908bc8.jpg?dl=mpv-shot0008.jpg
https://2chen.club/assets/images/src/c13d3c88a492af14ad9c2b4976e3848a21330be6.webm?dl=1597803090690.webm
https://2chen.club/assets/images/src/2a96b761e5e9370583461159902e946a8c0995ee.mp4?dl=P7ehIvVrPB3uNa0u.mp4
https://2chen.club/assets/images/src/8718b9ca3b425c2c27e1d72b9b9ac3962c23f329.mp4?dl=6865240778499788034.mp4
I wouldn't bother as long as these URLs just "work", but you can if you want to. I don't really mind. If you want an absolute decision from me: No, leave them as they are.
let's just leave it as it is then
gallery-dl/gallery_dl/extractor/2chen.py, line 46 (at a074f6f):

this is no longer needed
Yes, it's better to use the URL with no query params at all. Maybe …
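For illustration only, a minimal sketch of dropping such a query string with the standard library (strip_query is a hypothetical helper, not a gallery-dl function):

```python
from urllib.parse import urlsplit, urlunsplit

def strip_query(url):
    """Return url without its query string or fragment (hypothetical helper)."""
    scheme, netloc, path, _query, _fragment = urlsplit(url)
    return urlunsplit((scheme, netloc, path, "", ""))

url = ("https://2chen.club/assets/images/src/"
       "cdc03ac39210e3127328bb4c7afb9c3643908bc8.jpg?dl=mpv-shot0008.jpg")
print(strip_query(url))
# https://2chen.club/assets/images/src/cdc03ac39210e3127328bb4c7afb9c3643908bc8.jpg
```

In the diff below, the same effect is achieved more simply by cutting the extracted href off at '?dl='.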
Maybe it's time for some refactoring:

diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py
index 2cf8b305..3dcb00bd 100644
--- a/gallery_dl/extractor/2chen.py
+++ b/gallery_dl/extractor/2chen.py
@@ -17,7 +17,7 @@ class _2chenThreadExtractor(Extractor):
     directory_fmt = ("{category}", "{board}", "{thread} {title}")
     filename_fmt = "{time} {filename}.{extension}"
     archive_fmt = "{board}_{thread}_{hash}_{time}"
-    pattern = r"(?:https?://)?2chen\.(?:moe|club)/([^/?#]+)/(\d+)"
+    pattern = r"((?:https?://)?2chen\.(?:moe|club))/([^/?#]+)/(\d+)"
     test = (
         ("https://2chen.moe/tv/496715", {
             "count": ">= 179",
@@ -31,47 +31,37 @@ class _2chenThreadExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.root = text.root_from_url(match.group(0))
-        self.board, self.thread = match.groups()
+        root, self.board, self.thread = match.groups()
+        self.root = text.ensure_http_scheme(root)
 
     def items(self):
         url = "{}/{}/{}".format(self.root, self.board, self.thread)
         page = self.request(url, encoding="utf-8", notfound="thread").text
-        data = self.metadata(page)
-        yield Message.Directory, data
-        for post in self.posts(page):
-            if not post["url"]:
-                continue
-            post.update(data)
-            post["url"] = self.root + post["url"]
-            post["time"] = text.parse_int(post["date"].timestamp())
-            yield Message.Url, post["url"], text.nameext_from_url(
-                post["filename"], post)
-
-    def metadata(self, page):
         board, pos = text.extract(page, 'class="board">/', '/<')
-        title = text.extract(page, "<h3>", "</h3>", pos)[0]
-        return {
+        title, _ = text.extract(page, "<h3>", "</h3>", pos)
+        data = {
             "board" : board,
             "thread": self.thread,
-            "title" : text.unescape(title),
+            "title" : text.unescape(title)[:50],
         }
 
-    def posts(self, page):
-        """Return iterable with relevant posts"""
-        return map(self.parse, text.extract_iter(
-            page, 'class="glass media', '</article>'))
+        yield Message.Directory, data
+        for post in text.extract_iter(
+                page, 'class="glass media', '</article>'):
+            post = self.parse(post)
+            post.update(data)
+            post["time"] = text.parse_int(post["date"].timestamp())
+            yield Message.Url, post["url"], text.nameext_from_url(
+                post["filename"], post)
 
     def parse(self, post):
         extr = text.extract_from(post)
         return {
             "name"    : text.unescape(extr("<span>", "</span>")),
             "date"    : text.parse_datetime(
-                extr("<time", "<").partition(">")[2],
-                "%d %b %Y (%a) %H:%M:%S"
-            ),
+                extr("<time>", "</time>"), "%d %b %Y (%a) %H:%M:%S"),
             "no"      : extr('href="#p', '"'),
-            "url"     : extr('</a><a href="', '"'),
+            "url"     : extr('</a><a href="', '?dl='),
             "filename": text.unescape(extr('download="', '"')),
             "hash"    : extr('data-hash="', '"'),
         }
@@ -81,7 +71,7 @@ class _2chenBoardExtractor(Extractor):
     """Extractor for 2chen boards"""
     category = "2chen"
     subcategory = "board"
-    pattern = r"(?:https?://)?2chen\.(?:moe|club)/([^/?#]+)(?:/catalog|/?$)"
+    pattern = r"((?:https?://)?2chen\.(?:moe|club))/([^/?#]+)(?:/catalog|/?$)"
     test = (
         ("https://2chen.moe/co/", {
             "pattern": _2chenThreadExtractor.pattern
@@ -93,8 +83,8 @@ class _2chenBoardExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.root = text.root_from_url(match.group(0))
-        self.board = match.group(1)
+        root, self.board = match.groups()
+        self.root = text.ensure_http_scheme(root)
 
     def items(self):
         url = "{}/{}/catalog".format(self.root, self.board)
But why? A browser would send those query parameters as well, so what other reason except saving a few extra bytes per request would there be?
The only thing I would potentially change is inheriting from a common base class, so …
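Roughly, such a shared base class could look like the sketch below (hypothetical layout of my own, reusing the module's existing Extractor and text imports and the patterns from the diff above; not code from this PR):

```python
class _2chenExtractor(Extractor):
    """Hypothetical common base class for 2chen extractors"""
    category = "2chen"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # group 1 captures the (scheme-optional) site root, e.g. "2chen.club"
        self.root = text.ensure_http_scheme(match.group(1))


class _2chenThreadExtractor(_2chenExtractor):
    subcategory = "thread"
    pattern = r"((?:https?://)?2chen\.(?:moe|club))/([^/?#]+)/(\d+)"

    def __init__(self, match):
        _2chenExtractor.__init__(self, match)
        self.board, self.thread = match.group(2), match.group(3)


class _2chenBoardExtractor(_2chenExtractor):
    subcategory = "board"
    pattern = r"((?:https?://)?2chen\.(?:moe|club))/([^/?#]+)(?:/catalog|/?$)"

    def __init__(self, match):
        _2chenExtractor.__init__(self, match)
        self.board = match.group(2)
```

The only thing this really deduplicates for two small extractors is the root handling in __init__.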
What if the URLs change and …
There's no need to capture the entire URL a second time just to save one line of code. I'm just going to merge this as is for now. Anything else should go into a separate PR.
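To make the "second capture" point concrete: with the proposed pattern, group 1 duplicates the site root that is already part of the full match (group 0), which is what the existing text.root_from_url(match.group(0)) call derives it from. A small illustration with plain re, outside gallery-dl:

```python
import re

pattern = r"((?:https?://)?2chen\.(?:moe|club))/([^/?#]+)/(\d+)"
m = re.match(pattern, "https://2chen.club/tv/1")

print(m.group(0))   # https://2chen.club/tv/1  -- full match already contains the root
print(m.groups())   # ('https://2chen.club', 'tv', '1')
```

In other words, the version being merged can keep the single-group pattern and take the root from the full match instead of a dedicated capture group.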
I removed those functions to be the same as the other chan extractors.