From 1cf0e3c31b4b2b8e1d26211bdecf1c852fb29540 Mon Sep 17 00:00:00 2001
From: Paul Pritchard
Date: Sun, 23 Sep 2018 17:34:01 -0500
Subject: [PATCH] [nbc] Add support for series playlists

---
 youtube_dl/extractor/extractors.py |   3 +-
 youtube_dl/extractor/nbc.py        | 115 ++++++++++++++++++++++++++++-
 2 files changed, 115 insertions(+), 3 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 464c8d690..4d9d92438 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -684,6 +684,7 @@ from .nbc import (
     NBCSportsIE,
     NBCSportsStreamIE,
     NBCSportsVPlayerIE,
+    NBCPlaylistIE,
 )
 from .ndr import (
     NDRIE,
@@ -1367,7 +1368,7 @@ from .webofstories import (
     WebOfStoriesPlaylistIE,
 )
 from .weibo import (
-    WeiboIE,
+    WeiboIE,
     WeiboMobileIE
 )
 from .weiqitv import WeiqiTVIE
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 765c46fd2..dd2ef95f1 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -15,7 +15,116 @@ from ..utils import (
     unescapeHTML,
     update_url_query,
     int_or_none,
+    url_or_none,
+    js_to_json,
+    urljoin,
+    parse_duration,
+    try_get,
+    unified_strdate,
 )
+
+
+class NBCPlaylistIE(InfoExtractor):
+    """Extract every full episode of an NBC series as a playlist."""
+
+    IE_DESC = 'NBC series playlists'
+    IE_NAME = 'nbc.com:playlist'
+    _VALID_URL = r'(?i)https?://(?:www\.)?nbc\.com/(?P<id>[\w-]+)/episodes?$'
+    _TESTS = [
+        {
+            'url': 'https://www.nbc.com/saved-by-the-bell/episodes',
+            'info_dict': {
+                'id': '11d71744-7ca0-4e79-9547-e239371533d7',
+                'title': 'Saved by the Bell',
+            },
+            'playlist_count': 99,
+        },
+        {
+            'url': 'https://www.nbc.com/heroes/episodes',
+            'info_dict': {
+                'id': '99d3a2c1-fd98-43b9-a7a4-f7872b0eb808',
+                'title': 'Heroes',
+            },
+            'playlist_count': 78,
+        },
+    ]
+
+    def extract_episode_info(self, url, json_data):
+        """Turn one page of the videos API response into playlist entries.
+
+        Returns a dict holding the show's 'series_title' and the list of
+        'episodes' (url-type entries handled by the NBC extractor).
+        """
+        series_title = None
+        image_paths = {}
+        # Index the side-loaded resources once instead of rescanning them
+        # for every episode.
+        for inc in json_data.get('included') or []:
+            inc_atr = inc.get('attributes') or {}
+            if inc.get('type') == 'shows':
+                series_title = inc_atr.get('shortTitle')
+            elif inc.get('type') == 'images' and inc_atr.get('path'):
+                image_paths[inc.get('id')] = inc_atr['path']
+
+        entries = []
+        for ep in json_data.get('data') or []:
+            atr = ep.get('attributes')
+            if ep.get('type') != 'videos' or not atr:
+                continue
+            episode_url = url_or_none(atr.get('fullUrl'))
+            if not episode_url:
+                continue
+            # Any level of the relationship chain may be missing or null.
+            image_id = try_get(
+                ep, lambda x: x['relationships']['image']['data']['id'])
+            entries.append({
+                '_type': 'url',
+                'id': ep.get('id'),
+                'ie_key': 'NBC',
+                'title': series_title,
+                'url': episode_url,
+                'duration': parse_duration(atr.get('runTime')),
+                'thumbnail': url_or_none(
+                    urljoin(url, image_paths.get(image_id))),
+                # unified_strdate() returns None when airdate is absent.
+                'upload_date': unified_strdate(atr.get('airdate')),
+                'episode_title': atr.get('title'),
+                'episode_number': int_or_none(atr.get('episodeNumber')),
+                'season_number': int_or_none(atr.get('seasonNumber')),
+                'series': series_title,
+            })
+
+        return {'series_title': series_title, 'episodes': entries}
+
+    def _real_extract(self, url):
+        show_name = self._match_id(url)
+        webpage = self._download_webpage(url, show_name)
+
+        # NOTE(review): this pattern is empty in the patch as received and
+        # has no capture group; it looks truncated and must be restored.
+        show_name_js = self._search_regex(
+            r'', webpage, 'show_name')
+        show = self._parse_json(
+            show_name_js, show_name, transform_source=js_to_json)
+        show_id = (show.get('xref') or {}).get('/' + show_name)
+
+        entries = []
+        # Bound up front so the result below works when no show id is found.
+        series_title = None
+        episodes_url = None
+        if show_id:
+            episodes_url = 'https://api.nbc.com/v3.14/videos?include=image%%2Cshow.image%%2Cshow.aggregates&filter%%5Bshow%%5D=%s&filter%%5Btype%%5D%%5Bvalue%%5D=Full%%20Episode&filter%%5Btype%%5D%%5Boperator%%5D=%%3D&page%%5Bnumber%%5D=1&page%%5Bsize%%5D=50&sort=-airdate' % show_id
+        # Follow the API's 'next' links until the last page.
+        while episodes_url:
+            json_data = self._download_json(
+                episodes_url, show_name, 'Downloading episode playlist')
+            result = self.extract_episode_info(url, json_data)
+            entries.extend(result['episodes'])
+            series_title = series_title or result['series_title']
+            episodes_url = (json_data.get('links') or {}).get('next')
+
+        return self.playlist_result(
+            entries, show_id, playlist_title=series_title)
 
 
 class NBCIE(AdobePassIE):
@@ -214,7 +323,8 @@ class NBCSportsStreamIE(AdobePassIE):
                 break
         else:
             source_url = video_source['ottStreamUrl']
-        is_live = video_source.get('type') == 'live' or video_source.get('status') == 'Live'
+        is_live = video_source.get(
+            'type') == 'live' or video_source.get('status') == 'Live'
         resource = self._get_mvpd_resource('nbcsports', title, video_id, '')
         token = self._extract_mvpd_auth(url, video_id, 'nbcsports', resource)
         tokenized_url = self._download_json(
@@ -377,7 +487,8 @@ class NBCNewsIE(ThePlatformIE):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
         if video_id is not None:
-            all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
+            all_info = self._download_xml(
+                'http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
             info = all_info.find('video')
 
             return {