Add YoutubeShort result to YouTube URL parsing

medialab · Feb 29, 2024 · eb152c4 · eb152c4
1 parent 9ad3ce9
commit eb152c4
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 8 deletions.
diff --git a/test/youtube_test.py b/test/youtube_test.py
@@ -2,15 +2,16 @@
 # Ural Youtube Unit Tests
 # =============================================================================
 from ural.youtube import (
+    YoutubeChannel,
+    YoutubeShort,
+    YoutubeUser,
+    YoutubeVideo,
+    extract_video_id_from_youtube_url,
+    is_youtube_channel_id,
     is_youtube_url,
     is_youtube_video_id,
-    is_youtube_channel_id,
-    parse_youtube_url,
-    extract_video_id_from_youtube_url,
     normalize_youtube_url,
-    YoutubeVideo,
-    YoutubeUser,
-    YoutubeChannel,
+    parse_youtube_url,
 )
 
 IS_TESTS = [
@@ -232,6 +233,21 @@
         YoutubeChannel(id=None, name="28minutesARTE"),
         "https://www.youtube.com/28minutesARTE",
     ),
+    (
+        "https://www.youtube.com/shorts/xnh-JKqktAU",
+        YoutubeShort(id="xnh-JKqktAU"),
+        "https://www.youtube.com/shorts/xnh-JKqktAU",
+    ),
+    (
+        "https://www.youtube.com/shorts/U5Bn8mMxj4o/nonsense?whatever",
+        YoutubeShort(id="U5Bn8mMxj4o"),
+        "https://www.youtube.com/shorts/U5Bn8mMxj4o",
+    ),
+    (
+        "https://www.youtube.com/shorts/",
+        None,
+        "https://www.youtube.com/shorts/",
+    ),
 ]
 
 

diff --git a/ural/youtube.py b/ural/youtube.py
@@ -191,6 +191,8 @@
 # but there is no way to infer this...
 YOUTUBE_CHANNEL_NAME_URL_TEMPLATE = "https://www.youtube.com/%s"
 
+YOUTUBE_SHORT_URL_TEMPLATE = "https://www.youtube.com/shorts/%s"
+
 YOUTUBE_CHANNEL_NAME_BLACKLIST = {
     "about",
     "account",
@@ -209,7 +211,7 @@
 YoutubeVideo = namedtuple("YoutubeVideo", ["id", "playlist"])
 YoutubeUser = namedtuple("YoutubeUser", ["id", "name"])
 YoutubeChannel = namedtuple("YoutubeChannel", ["id", "name"])
-
+YoutubeShort = namedtuple("YoutubeShort", ["id"])
 
 # NOTE: we use a trie to perform efficient queries and so we don't
 # need to test every domain/subdomain linearly
@@ -369,6 +371,22 @@ def parse_youtube_url(url, fix_common_mistakes=True):
 
         return YoutubeChannel(id=cid, name=None)
 
+    elif path.startswith("/shorts/"):
+        splitted_path = pathsplit(path)
+
+        if len(splitted_path) < 2:
+            return None
+
+        v = splitted_path[1]
+
+        if fix_common_mistakes:
+            v = v[:11]
+
+        if not is_youtube_video_id(v):
+            return
+
+        return YoutubeShort(id=v)
+
     else:
         path = path.rstrip("/")
         if path.count("/") == 1:
@@ -414,4 +432,8 @@ def normalize_youtube_url(url):
 
         return YOUTUBE_CHANNEL_NAME_URL_TEMPLATE % parsed.name
 
+    if isinstance(parsed, YoutubeShort):
+        if parsed.id is not None:
+            return YOUTUBE_SHORT_URL_TEMPLATE % parsed.id
+
     raise TypeError("normalize_youtube_url: impossible path reached")
diff --git a/ural/youtube.pyi b/ural/youtube.pyi
@@ -12,11 +12,14 @@ class YoutubeChannel(NamedTuple):
     id: str
     name: str
 
+class YoutubeShort(NamedTuple):
+    id: str
+
 def is_youtube_url(url: AnyUrlTarget) -> bool: ...
 def is_youtube_video_id(value: str) -> bool: ...
 def is_youtube_channel_id(value: str) -> bool: ...
 def parse_youtube_url(
     url: AnyUrlTarget, fix_common_mistakes: bool = ...
-) -> Optional[Union[YoutubeVideo, YoutubeUser, YoutubeChannel]]: ...
+) -> Optional[Union[YoutubeVideo, YoutubeUser, YoutubeChannel, YoutubeShort]]: ...
 def extract_video_id_from_youtube_url(url: AnyUrlTarget) -> Optional[str]: ...
 def normalize_youtube_url(url: AnyUrlTarget) -> str: ...