From 1ac65a003a77781509e5ffcd8183071d39b9f345 Mon Sep 17 00:00:00 2001
From: Paul Pritchard
Date: Sun, 30 Sep 2018 12:42:13 -0500
Subject: [PATCH] [cbs] Add support for series playlists

---
Reviewer notes (this section is free text between the '---' marker and the
diffstat):
 * The hunks that rewrote CBSIE._extract_video_info and re-wrapped
   CBSBaseIE._parse_smil_subtitles were dropped: they are unrelated to
   playlist support and removed the site/mpx_acc parameters and the
   per-asset ExtractorError handling.
 * CBSPlaylistIE: restored the (?P<id>...) group in _VALID_URL, removed the
   debugging print(), passed a real video_id to _download_json, made date,
   thumbnail and pagination handling tolerant of missing values.
 * The commit hash and post-image blob hashes are those of the original
   submission and no longer describe this revised content.
 * The From: header carries no e-mail address; restore it before applying
   with git am.

 youtube_dl/extractor/cbs.py        | 103 +++++++++++++++++++++++++++++
 youtube_dl/extractor/extractors.py |   5 +-
 2 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index 1799d63ea..5f20654e3 100644
--- a/youtube_dl/extractor/cbs.py
+++ b/youtube_dl/extractor/cbs.py
@@ -1,13 +1,19 @@
 from __future__ import unicode_literals
 
+from .common import InfoExtractor
 from .theplatform import ThePlatformFeedIE
 from ..utils import (
     ExtractorError,
     int_or_none,
+    url_or_none,
+    js_to_json,
     find_xpath_attr,
+    parse_duration,
+    unified_strdate,
     xpath_element,
     xpath_text,
     update_url_query,
+    urljoin,
 )
 
 
@@ -22,6 +28,103 @@ class CBSBaseIE(ThePlatformFeedIE):
         } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else []
 
 
+class CBSPlaylistIE(InfoExtractor):
+    IE_DESC = 'CBS series playlists'
+    IE_NAME = 'cbs.com:playlist'
+    _VALID_URL = r'(?i)https?://(?:www\.)?cbs\.com/shows/(?P<id>[\w-]+)/?$'
+    _TESTS = [{
+        'url': 'https://www.cbs.com/shows/frasier/',
+        'info_dict': {
+            'id': '61456196',
+            'title': 'Frasier',
+        },
+        'playlist_count': 264,
+    }, {
+        'url': 'https://www.cbs.com/shows/star_trek/',
+        'info_dict': {
+            'id': '22927',
+            'title': 'Star Trek: The Original Series (Remastered)',
+        },
+        'playlist_count': 79,
+    }]
+    # Number of episodes requested per carousel page
+    _PAGE_SIZE = 10
+
+    def extract_episode_info(self, url, json_data):
+        """Turn one page of carousel JSON into playlist entries.
+
+        Returns a dict with 'series_title' (None for an empty page) and
+        'episodes', a list of url entries handled by the CBS extractor.
+        """
+        episodes = (json_data.get('result') or {}).get('data') or []
+
+        series_title = None
+        entries = []
+        for ep in episodes:
+            series_title = ep.get('series_title') or series_title
+            episode_url = url_or_none(urljoin(url, ep.get('url')))
+            if not episode_url:
+                continue
+            entries.append({
+                '_type': 'url',
+                'id': ep.get('content_id'),
+                'ie_key': 'CBS',
+                'title': ep.get('title'),
+                'url': episode_url,
+                'duration': parse_duration(ep.get('duration')),
+                'thumbnail': (ep.get('thumb') or {}).get('large'),
+                # unified_strdate returns None instead of raising when
+                # airdate is missing or not in the expected format
+                'upload_date': unified_strdate(ep.get('airdate')),
+                'episode': ep.get('episode_title'),
+                'episode_number': int_or_none(ep.get('episode_number')),
+                'season_number': int_or_none(ep.get('season_number')),
+                'series': ep.get('series_title'),
+            })
+
+        return {'series_title': series_title, 'episodes': entries}
+
+    def _real_extract(self, url):
+        show_name = self._match_id(url)
+        webpage = self._download_webpage(url, show_name)
+
+        # The show page passes its metadata as a JS object literal to the
+        # CBS.Show constructor
+        show = self._parse_json(
+            self._search_regex(
+                r'new CBS\.Show\(([^)]*)\);', webpage, 'show data'),
+            show_name, transform_source=js_to_json)
+        show_id = show.get('id')
+        if not show_id:
+            raise ExtractorError(
+                'Unable to extract show id', video_id=show_name)
+
+        entries = []
+        series_title = None
+        offset = 0
+        while True:
+            json_data = self._download_json(
+                urljoin(url, '/carousels/shows/%s/offset/%d/limit/%d/xs/0/' % (
+                    show_id, offset, self._PAGE_SIZE)),
+                show_name,
+                note='Downloading episode playlist (offset %d)' % offset)
+            page = self.extract_episode_info(url, json_data)
+            entries.extend(page['episodes'])
+            series_title = page['series_title'] or series_title
+            offset += self._PAGE_SIZE
+
+            result = json_data.get('result') or {}
+            total = int_or_none(result.get('total'))
+            # Stop on an empty page too, so the loop terminates even when
+            # the API reports no usable total
+            if not result.get('data') or (
+                    total is not None and offset >= total):
+                break
+
+        return self.playlist_result(
+            entries, '%s' % show_id, playlist_title=series_title)
+
+
 class CBSIE(CBSBaseIE):
     _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
 
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 4d9d92438..b951ce7dd 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -173,7 +173,10 @@ from .cbc import (
     CBCWatchIE,
     CBCOlympicsIE,
 )
-from .cbs import CBSIE
+from .cbs import (
+    CBSIE,
+    CBSPlaylistIE,
+)
 from .cbslocal import CBSLocalIE
 from .cbsinteractive import CBSInteractiveIE
 from .cbsnews import (