Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AENetworks] Update extractor for the current site(s) #26795

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 131 additions & 31 deletions youtube_dl/extractor/aenetworks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals

import json
import re

from .theplatform import ThePlatformIE
Expand All @@ -20,6 +21,13 @@ class AENetworksBaseIE(ThePlatformIE):
_THEPLATFORM_KEY = 'crazyjava'
_THEPLATFORM_SECRET = 's3cr3t'

def _parse_theplatform_metadata(self, info):
metadata = super()._parse_theplatform_metadata(info)
metadata['season_number'] = int(info.get('AETN$season'))
metadata['episode_number'] = int(info.get('AETN$episode'))
metadata['series'] = info.get('AETN$seriesNameGlobal')
return metadata

def _extract_aen_smil(self, smil_url, video_id, auth=None):
query = {'mbr': 'true'}
if auth:
Expand Down Expand Up @@ -63,7 +71,7 @@ class AENetworksIE(AENetworksBaseIE):
IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault'
_VALID_URL = r'''(?x)
https?://
(?:www\.)?
(?:(?P<subdomain>www|play)\.)?
(?P<domain>
(?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com|
fyi\.tv
Expand All @@ -75,11 +83,26 @@ class AENetworksIE(AENetworksBaseIE):
collections/[^/]+/(?P<collection_display_id>[^/]+)
)
'''
_GRAPHQL_QUERY = """
fragment video on Video {
publicUrl
}

query getUserVideo($videoId: ID!) {
video(id: $videoId) {
...video
}
}
"""

_TESTS = [{
'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
'info_dict': {
'id': '22253814',
'ext': 'mp4',
'series': 'Mountain Men',
'season_number': 1,
'episode_number': 1,
'title': 'Winter is Coming',
'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
'timestamp': 1338306241,
Expand All @@ -91,19 +114,60 @@ class AENetworksIE(AENetworksBaseIE):
'skip_download': True,
},
'add_ie': ['ThePlatform'],
}, {
'url': 'http://play.history.com/shows/mountain-men/season-1/episode-1',
'info_dict': {
'id': '22253814',
'ext': 'mp4',
'series': 'Mountain Men',
'season_number': 1,
'episode_number': 1,
'title': 'Winter Is Coming',
'description': 'md5:a40e370925074260b1c8a633c632c63a',
'timestamp': 1338306241,
'upload_date': '20120529',
'uploader': 'AENE-NEW',
},
'params': {
# m3u8 download
'skip_download': True,
},
'add_ie': ['ThePlatform'],
}, {
'url': 'http://www.history.com/shows/ancient-aliens/season-1',
'info_dict': {
'id': '71889446852',
'title': 'Ancient Aliens'
},
'playlist_mincount': 5,
}, {
'url': 'http://www.mylifetime.com/shows/atlanta-plastic',
'url': 'http://www.mylifetime.com/shows/marrying-millions',
'info_dict': {
'id': 'SERIES4317',
'title': 'Atlanta Plastic',
'id': 'SERIES6093',
'title': 'Marrying Millions',
},
'playlist_mincount': 2,
'playlist_mincount': 1,
}, {
'url': 'http://www.mylifetime.com/shows/marrying-millions/season-1',
'info_dict': {
'id': '269343782619',
'title': 'Marrying Millions',
},
'playlist_mincount': 10,
}, {
'url': 'https://play.mylifetime.com/shows/marrying-millions/season-1',
'info_dict': {
'id': 'SERIES6093',
'title': 'Marrying Millions',
},
'playlist_mincount': 10,
}, {
'url': 'https://play.mylifetime.com/shows/marrying-millions',
'info_dict': {
'id': 'SERIES6093',
'title': 'Marrying Millions',
},
'playlist_mincount': 11,
}, {
'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
'only_matching': True
Expand Down Expand Up @@ -137,56 +201,92 @@ class AENetworksIE(AENetworksBaseIE):
'fyi.tv': 'FYI',
}

def _real_extract(self, url):
domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups()
display_id = show_path or movie_display_id or special_display_id or collection_display_id
webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers())
if show_path:
url_parts = show_path.split('/')
def _extract_playlist(self, url, webpage, display_id, subdomain, url_parts):
# The "play" is pretty distinct from the normal sites, however, it contains all the data we need in a JSON blob.
if subdomain == 'play':
series_id = self._search_regex(r'showid/(SERIES[0-9]+)', webpage, 'series id')
season_num = int_or_none(self._search_regex(r'/season-([0-9]+)/?', url, 'season number', fatal=False, default=None))
show_data = self._parse_json(
self._search_regex(r'(?s)<script[^>]+id="__NEXT_DATA__"[^>]*>(.+?)</script', webpage, 'show data'),
series_id, fatal=True)
if show_data:
apolloState = show_data.get('props', {}).get('apolloState', {})
entries = []
for key, episode in apolloState.items():
if not key.startswith('Episode:') or series_id != episode.get('seriesId'):
continue
# If a season number was specified in the URL, filter out any episodes that don't match.
if season_num and season_num != episode.get('tvSeasonNumber'):
continue
episode_url = compat_urlparse.urljoin(url, episode.get('canonical'))
entries.append(self.url_result(episode_url, 'AENetworks', episode.get('id'), episode.get('title')))
series_name = apolloState.get('Series:%s' % series_id, {}).get('title')
return self.playlist_result(entries, series_id, series_name)
else:
series_title = self._html_search_meta('aetn:SeriesTitle', webpage)
url_parts_len = len(url_parts)
if url_parts_len == 1:
entries = []
for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
for season_url_path in re.findall(r'(?s)<a[^>]+href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
entries.append(self.url_result(
compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
if entries:
return self.playlist_result(
entries, self._html_search_meta('aetn:SeriesId', webpage),
self._html_search_meta('aetn:SeriesTitle', webpage))
else:
# single season
url_parts_len = 2
entries, self._html_search_meta('aetn:SeriesId', webpage), series_title)
raise ExtractorError('Failed to extract seasons for show: %s' % url_parts[0])
if url_parts_len == 2:
entries = []
for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage):
for episode_item in re.findall(r'(?s)<[^>]+data-episodetype[^>]*>', webpage):
episode_attributes = extract_attributes(episode_item)
episode_url = compat_urlparse.urljoin(
url, episode_attributes['data-canonical'])
entries.append(self.url_result(
episode_url, 'AENetworks',
episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id')))
video_id = episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id')
episode_title = episode_attributes.get('aria-label')
entries.append(self.url_result(episode_url, 'AENetworks', video_id, episode_title))
return self.playlist_result(
entries, self._html_search_meta('aetn:SeasonId', webpage))

video_id = self._html_search_meta('aetn:VideoID', webpage)
media_url = self._search_regex(
[r"media_url\s*=\s*'(?P<url>[^']+)'",
r'data-media-url=(?P<url>(?:https?:)?//[^\s>]+)',
r'data-media-url=(["\'])(?P<url>(?:(?!\1).)+?)\1'],
webpage, 'video url', group='url')
entries, self._html_search_meta('aetn:SeasonId', webpage), series_title)
raise ExtractorError('Failed to extract playlist', video_id=display_id)

def _real_extract(self, url):
subdomain, domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups()
display_id = show_path or movie_display_id or special_display_id or collection_display_id
webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers())

if show_path:
url_parts = show_path.split('/')
# If there's only the show name and/or season number then we'll need to extract a playlist.
if len(url_parts) < 3:
return self._extract_playlist(url, webpage, display_id, subdomain, url_parts)

requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain]
video_id = self._html_search_meta(['videoId', 'aetn:VideoID'], webpage)
# Make a GraphQL query to get the episode URL as they no longer directly embed it in the response webpage.
video_data = self._download_json(
'https://yoga.appsvcs.aetnd.com/graphql?brand=%s&mode=live&platform=web' % (requestor_id.lower()), video_id,
data=json.dumps(
{
'operationName': 'getUserVideo',
'variables': {'videoId': video_id},
'query': self._GRAPHQL_QUERY,
}).encode('utf-8'),
headers={'Content-Type': 'application/json'})
media_url = video_data.get('data', {}).get('video', {}).get('publicUrl')
if not media_url:
raise ExtractorError('Failed to extract media URL', video_id=video_id)
theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
info = self._parse_theplatform_metadata(theplatform_metadata)
auth = None
if theplatform_metadata.get('AETN$isBehindWall'):
requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain]
resource = self._get_mvpd_resource(
requestor_id, theplatform_metadata['title'],
theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'),
theplatform_metadata['ratings'][0]['rating'])
auth = self._extract_mvpd_auth(
url, video_id, requestor_id, resource)
info.update(self._search_json_ld(webpage, video_id, fatal=False))
# JSON-LD data isn't present on the play subdomain webpages.
if subdomain != 'play':
info.update(self._search_json_ld(webpage, video_id, fatal=False))
info.update(self._extract_aen_smil(media_url, video_id, auth))
return info

Expand Down