Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Tube8] Fixed video download and added channel download #32839

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion youtube_dl/extractor/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1327,7 +1327,10 @@
)
from .trunews import TruNewsIE
from .trutv import TruTVIE
from .tube8 import Tube8IE
from .tube8 import (
Tube8IE,
Tube8ListIE,
)
from .tubitv import TubiTvIE
from .tumblr import TumblrIE
from .tunein import (
Expand Down
308 changes: 243 additions & 65 deletions youtube_dl/extractor/tube8.py
Original file line number Diff line number Diff line change
@@ -1,86 +1,264 @@
# coding: utf-8
from __future__ import unicode_literals
FildCommander marked this conversation as resolved.
Show resolved Hide resolved

import itertools
import re
from time import sleep

from .common import InfoExtractor
from ..utils import (
clean_html,
get_element_by_class,
get_element_by_id,
int_or_none,
str_to_int,
parse_qs,
strip_or_none,
T,
traverse_obj,
url_or_none,
urljoin,
)
from .keezmovies import KeezMoviesIE


class Tube8IE(InfoExtractor):
    """Extract a single video from a tube8.com /porn-video/ page.

    The site was redesigned: metadata now comes from JSON-LD plus the JS
    `playervars` object, and MP4 URLs are fetched from a per-format JSON
    endpoint listed in `playervars['mediaDefinitions']`.
    """
    _VALID_URL = r'https?:\/\/(?:www\.)?tube8\.com\/+porn-video+\/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.tube8.com/porn-video/189530841/',
        'md5': '532408f59e89a32027d873af6289c85a',
        'info_dict': {
            'id': '189530841',
            'ext': 'mp4',
            'title': 'Found dildo. She let it cum in her tight ass to keep the secret',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'MaryKrylova',
            'age_limit': 18,
        },
    }]

    # Matches Tube8 embed iframes found in third-party pages
    _EMBED_REGEX = r'<iframe [^>]*\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)*\d+)'

    @classmethod
    def _extract_urls(cls, webpage):
        """Return all Tube8 embed URLs found in `webpage`."""
        return [m.group('url') for m in re.finditer(cls._EMBED_REGEX, webpage)]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Some jurisdictions get an age-verification interstitial instead of
        # the video page; report it as a geo-restriction with the site's text.
        age_verify_msg = self._search_regex(
            r'(your elected officials in \w+(?:\s+\w+){,2} are requiring us to verify your age before allowing you access to our website)',
            webpage, 'age verification message', default=None)
        if age_verify_msg:
            self.raise_geo_restricted('%s said: "%s"' % (self.IE_NAME, age_verify_msg))

        playervars = self._search_json(
            r'\bplayervars\s*:', webpage, 'playervars', video_id)

        # Seed metadata from JSON-LD, but drop fields that are unreliable here
        info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
        for k in ('url', 'description'):
            info.pop(k, None)

        info['uploader'] = clean_html(get_element_by_class('submitByLink', webpage))

        # Borrowed from youporn extractor
        def get_fmt(x):
            # validate the media-definition URL; yield (format, definition)
            v_url = url_or_none(x.get('videoUrl'))
            if v_url:
                x['videoUrl'] = v_url
                return (x['format'], x)

        defs_by_format = dict(traverse_obj(playervars, (
            'mediaDefinitions', lambda _, v: v.get('format'), T(get_fmt))))

        # final playervars['video_title'] fallback deliberately raises KeyError
        # if no title can be found at all
        info['title'] = strip_or_none(playervars.get('video_title')) or info.get('title') or playervars['video_title']
        if not info.get('thumbnail'):
            info['thumbnail'] = traverse_obj(playervars, ('image_url', T(url_or_none)))

        # Borrowed from youporn extractor
        def get_format_data(f):
            # resolve a format name to its JSON list of stream URLs
            if f not in defs_by_format:
                return []
            return self._download_json(
                defs_by_format[f]['videoUrl'], video_id, '{0}-formats'.format(f))

        formats = []
        for mp4_url in traverse_obj(
                get_format_data('mp4'),
                # prefer entries carrying a numeric defaultQuality, else any URL
                (lambda _, v: not isinstance(v['defaultQuality'], bool), 'videoUrl'),
                (Ellipsis, 'videoUrl'),
                expected_type=url_or_none):
            # `or (None, None)`: _search_regex(fatal=False) returns None on no
            # match, which would make map() raise TypeError
            height, tbr = map(int_or_none, self._search_regex(
                r'(?i)(?P<height>\d{3,4})p_(?P<bitrate>\d+)k_\d+', mp4_url,
                'media details', group=('height', 'bitrate'),
                fatal=False) or (None, None))
            formats.append({
                'format_id': '%dp' % height if height else 'mp4',
                'url': mp4_url,
                'ext': 'mp4',
                'height': height,
                'tbr': tbr,
            })

        self._sort_formats(formats)

        info.update({
            'id': video_id,
            'formats': formats,
            'age_limit': 18,
        })

        return info


# Currently only user channels
class Tube8ListIE(InfoExtractor):
    """Playlist extractor for tube8.com user-video (channel) pages.

    Page walking, retry handling and title/slug helpers are borrowed from
    the youporn extractor.
    """

    # a listing section's title bar must match this to be scanned for videos
    _PLAYLIST_TITLEBAR_RE = r'\s+[Vv]ideos\s*$'
    _PAGE_RETRY_COUNT = 0  # ie, no retry
    _PAGE_RETRY_DELAY = 2  # seconds

    _VALID_URL = r'https?:\/\/(?:www\.)?tube8\.com\.?\/+user-videos\/+(?P<id>\d+)\/+(?P<author>[^\/]+)\/?.*'
    _TESTS = [{
        'url': 'https://www.tube8.com/user-videos/195075441/MaryKrylova/',
        'info_dict': {
            'id': '195075441',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'https://www.tube8.com/user-videos/195048331/FoxyElf/',
        'info_dict': {
            'id': '195048331',
        },
        'playlist_mincount': 86,
    }]

    # Borrowed from youporn extractor
    @classmethod
    def _get_title_from_slug(cls, title_slug):
        """Turn a URL slug into a human-readable title."""
        return re.sub(r'[_-]', ' ', title_slug)

    # Borrowed from youporn extractor
    def _get_next_url(self, url, pl_id, html):
        """Return the absolute URL of the next listing page, or None."""
        return urljoin(url, self._search_regex(
            r'''<a\s[^>]*?\bhref\s*=\s*("|')(?P<url>(?:(?!\1)[^>])+)\1''',
            get_element_by_id('next', html) or '', 'next page',
            group='url', default=None))

    # Borrowed from youporn extractor
    def _entries(self, url, pl_id, html=None, page_num=None):
        """Yield url_results for every video in the (paged) listing.

        `html`, if given, is the already-fetched first page; `page_num`
        restricts extraction to that single page.
        """

        # separates page sections
        PLAYLIST_SECTION_RE = (
            r'''<div\s[^>]*\bclass\s*=\s*('|")(?:[\w$-]+\s+|\s)*?title-bar(?:\s+[\w$-]+|\s)*\1[^>]*>'''
        )
        # contains video link
        VIDEO_URL_RE = r'''(?x)
            <div\s[^>]*\bdata-video-id\s*=\s*('|")\d+\1[^>]*>\s*
            (?:<div\b[\s\S]+?</div>\s*)*
            <a\s[^>]*\bhref\s*=\s*('|")(?P<url>(?:(?!\2)[^>])+)\2
        '''

        def yield_pages(url, html=html, page_num=page_num):
            # walk listing pages, yielding (url, html, page_num) triples
            fatal = not html
            for pnum in itertools.count(start=page_num or 1):
                if not html:
                    html = self._download_webpage(
                        url, pl_id, note='Downloading page %d' % pnum,
                        fatal=fatal)
                if not html:
                    break
                fatal = False
                yield (url, html, pnum)
                # explicit page: extract just that page
                if page_num is not None:
                    break
                next_url = self._get_next_url(url, pl_id, html)
                if not next_url or next_url == url:
                    break
                url, html = next_url, None

        def retry_page(msg, tries_left, page_data):
            # re-fetch a page after a delay; None when retries are exhausted
            if tries_left <= 0:
                return
            self.report_warning(msg, pl_id)
            sleep(self._PAGE_RETRY_DELAY)
            return next(
                yield_pages(page_data[0], page_num=page_data[2]), None)

        def yield_entries(html):
            # yield url_results for video links in matching page sections
            for frag in re.split(PLAYLIST_SECTION_RE, html):
                if not frag:
                    continue
                t_text = get_element_by_class('title-text', frag or '')
                if not (t_text and re.search(self._PLAYLIST_TITLEBAR_RE, t_text)):
                    continue
                for m in re.finditer(VIDEO_URL_RE, frag):
                    video_url = urljoin(url, m.group('url'))
                    if video_url:
                        yield self.url_result(video_url)

        last_first_url = None
        for page_data in yield_pages(url, html=html, page_num=page_num):
            # page_data: url, html, page_num
            first_url = None
            tries_left = self._PAGE_RETRY_COUNT + 1
            while tries_left > 0:
                tries_left -= 1
                for from_ in yield_entries(page_data[1]):
                    # may get the same page twice instead of empty page
                    # or (site bug) instead of actual next page
                    if not first_url:
                        first_url = from_['url']
                        if first_url == last_first_url:
                            # sometimes (/porntags/) the site serves the previous page
                            # instead but may provide the correct page after a delay
                            page_data = retry_page(
                                'Retrying duplicate page...', tries_left, page_data)
                            if page_data:
                                first_url = None
                                break
                            continue
                    yield from_
                else:
                    # for-loop completed: page was empty, or was extracted OK
                    if not first_url and 'no-result-paragarph1' in page_data[1]:
                        page_data = retry_page(
                            'Retrying empty page...', tries_left, page_data)
                        if page_data:
                            continue
                    else:
                        # success/failure
                        break
            # may get an infinite (?) sequence of empty pages
            if not first_url:
                break
            last_first_url = first_url

    # Borrowed from youporn extractor
    def _real_extract(self, url, html=None):
        # exceptionally, id may be None
        m_dict = self._match_valid_url(url).groupdict()
        # NOTE(review): 'type' and 'sort' are not groups of _VALID_URL (yet),
        # so both are always None here; kept for parity with the youporn code
        pl_id, page_type, sort = (m_dict.get(k) for k in ('id', 'type', 'sort'))

        # flatten query-string values to their last occurrence; iterate a
        # snapshot since deleting from a dict during .items() breaks on py3
        qs = parse_qs(url)
        for q, v in list(qs.items()):
            if v:
                qs[q] = v[-1]
            else:
                del qs[q]

        base_id = pl_id or 'Tube8'
        title = self._get_title_from_slug(base_id)
        if page_type:
            title = '%s %s' % (page_type.capitalize(), title)
        base_id = [base_id.lower()]
        if sort is None:
            title += ' videos'
        else:
            title = '%s videos by %s' % (title, re.sub(r'[_-]', ' ', sort))
            base_id.append(sort)
        if qs:
            ps = ['%s=%s' % item for item in sorted(qs.items())]
            title += ' (%s)' % ','.join(ps)
            base_id.extend(ps)
        pl_id = '/'.join(base_id)

        return self.playlist_result(
            self._entries(url, pl_id, html=html,
                          page_num=int_or_none(qs.get('page'))),
            playlist_id=pl_id, playlist_title=title)