From 41a33b2357bee0b0a374632582c418fe930de2a1 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Wed, 22 Feb 2017 20:23:15 +0100 Subject: [PATCH] [srgssr] Migrate to integrationlayer version 2.0 API The SRGSSR Play websites now often use the integrationlayer version 2.0 API instead of version 1.0. I have modified the SRGSSR information extractor to use this new integrationlayer instead of the old one. All the old media support this new version too, so there is no need to stick with the old one. It's possible that support for the old integrationlayer will be dropped, so this switch has to be made anyway. Here is a list of the changes: - Use integrationlayer version 2.0 API instead of version 1.0. - Ensure consistent media IDs. In the old version of the information extractor, youtube-dl extracts the same video for the URLs "http://www.srf.ch/play/tv/schweiz-aktuell/video/schweiz-aktuell-vom-22-02-2017?id=d0206674-6125-49ef-b85d-3cf36d24d582" and "http://www.srf.ch/play/tv/schweiz-aktuell/video/walliser-baubaubranche-wehrt-sich?id=967590f0-f812-4941-8f6a-06a2db7bd083", but uses different media IDs. Now it still extracts the same videos (since there is no support to cut videos into parts in youtube-dl, right?), but it uses the same media IDs. So we always have consistent media IDs for the media. - Add extraction of media duration. - Add extraction of video subtitles. - Use multiline regular expressions for _VALID_URL for better readability. - Indicate direct podcast downloads in format_id. - Update tests. 
--- youtube_dl/extractor/srgssr.py | 319 ++++++++++++++++++++++++++------- 1 file changed, 253 insertions(+), 66 deletions(-) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index bb73eb1d5..752611b0b 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -7,13 +7,36 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, + float_or_none, parse_iso8601, + str_or_none, qualities, ) class SRGSSRIE(InfoExtractor): - _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?Psrf|rts|rsi|rtr|swi):(?:[^:]+:)?(?Pvideo|audio):(?P[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'''(?x) + (?: + https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn| + srgssr + ): + (?P + srf| + rts| + rsi| + rtr| + swi + ):(?:[^:]+:)? + (?P + video| + audio + ): + (?P + [0-9a-f\-]{36}| + \d+ + ) + ''' + _GEO_BYPASS = False _GEO_COUNTRIES = ['CH'] @@ -36,61 +59,122 @@ class SRGSSRIE(InfoExtractor): url += '?' 
+ auth_params return url - def get_media_data(self, bu, media_type, media_id): + def get_media_id_and_data(self, bu, media_type, url_id): media_data = self._download_json( - 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), - media_id)[media_type.capitalize()] + 'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' % (bu, media_type, url_id), url_id) + media_id = self._search_regex( + r'urn:%s:%s:([0-9a-f\-]{36}|\d+)' % (bu, media_type), + media_data['chapterUrn'], + 'media id') - if media_data.get('block') and media_data['block'] in self._ERRORS: - message = self._ERRORS[media_data['block']] - if media_data['block'] == 'GEOBLOCK': + episode_data = media_data.get('episode', {}) + + chapter_list = media_data.get('chapterList', []) + data = [] + if chapter_list: + data.extend([item for item in chapter_list if item.get('id') == media_id]) + if not data: + raise ExtractorError('%s said: Cannot extract chapter information.' % self.IE_NAME) + + chapter_data = data[0] + block_reason = str_or_none(chapter_data.get('blockReason')) + if block_reason and block_reason in self._ERRORS: + message = self._ERRORS[block_reason] + if block_reason == 'GEOBLOCK': self.raise_geo_restricted( msg=message, countries=self._GEO_COUNTRIES) raise ExtractorError( '%s said: %s' % (self.IE_NAME, message), expected=True) + elif block_reason: + message = 'This media is not available. 
Reason: %s' % block_reason + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, message)) + + return media_id, episode_data, chapter_data + + def _get_subtitles(self, bu, media_data): + subtitles = {} + subtitle_data = media_data.get('subtitleList', []) + + default_language_codes = { + 'srf': 'de', + 'rts': 'fr', + 'rsi': 'it', + 'rtr': 'rm', + 'swi': 'en', + } + known_formats = ('TTML', 'VTT') + for sub in subtitle_data: + form = sub['format'] + if form not in known_formats: + continue + lang = sub.get('locale') or default_language_codes[bu] + subtitles.setdefault(lang, []).append({ + 'ext': form.lower(), + 'url': sub['url'] + }) + + # Prefer VTT subtitles over TTML: + priorities = { + 'ttml': 1, + 'vtt': 2, + } + for lang in subtitles: + subtitles[lang].sort(key=lambda x: priorities[x['ext']]) + + return subtitles - return media_data def _real_extract(self, url): - bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + bu, media_type, url_id = re.match(self._VALID_URL, url).groups() - media_data = self.get_media_data(bu, media_type, media_id) + media_id, episode_data, chapter_data = self.get_media_id_and_data(bu, media_type, url_id) - metadata = media_data['AssetMetadatas']['AssetMetadata'][0] - title = metadata['title'] - description = metadata.get('description') - created_date = media_data.get('createdDate') or metadata.get('createdDate') + is_episode = True if chapter_data['position'] == 0 else False + title = episode_data['title'] if is_episode else chapter_data['title'] + description = chapter_data.get('description') + duration = float_or_none(chapter_data['duration'], scale=1000) + created_date = chapter_data.get('date') timestamp = parse_iso8601(created_date) - - thumbnails = [{ - 'id': image.get('id'), - 'url': image['url'], - } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])] + thumbnail = chapter_data.get('imageUrl') + subtitles = self._get_subtitles(bu, chapter_data) preference = 
qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) formats = [] - for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []): - protocol = source.get('@protocol') - for asset in source['url']: - asset_url = asset['text'] - quality = asset['@quality'] - format_id = '%s-%s' % (protocol, quality) - if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): - asset_url = self._get_tokenized_src(asset_url, media_id, format_id) - if protocol.startswith('HTTP-HDS'): - formats.extend(self._extract_f4m_formats( - asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0', - media_id, f4m_id=format_id, fatal=False)) - elif protocol.startswith('HTTP-HLS'): - formats.extend(self._extract_m3u8_formats( - asset_url, media_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) + for source in chapter_data.get('resourceList', []): + protocol = str_or_none(source['protocol']) + quality = str_or_none(source['quality']) + encoding = str_or_none(source['encoding']) + format_url = source.get('url') + format_id = '%s-%s-%s' % (protocol, encoding, quality) + + if protocol in ('HDS', 'HLS'): + format_url = self._get_tokenized_src(format_url, media_id, format_id) + if protocol == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url + ('?' if '?' 
not in format_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) else: + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + elif protocol in ('HTTP', 'HTTPS', 'RTMP'): + formats.append({ + 'format_id': format_id, + 'ext': encoding.lower() if encoding else None, + 'url': format_url, + 'preference': preference(quality) + }) + + podcast_keys = ('podcastSdUrl', 'podcastHdUrl') + podcast_qualities = ('SD', 'HD') + if chapter_data['position'] == 0: + for key, quality in zip(podcast_keys, podcast_qualities): + if chapter_data.get(key): formats.append({ - 'format_id': format_id, - 'url': asset_url, + 'format_id': 'PODCAST-HTTP-%s' % quality, + 'url': chapter_data[key], 'preference': preference(quality), - 'ext': 'flv' if protocol == 'RTMP' else None, }) self._sort_formats(formats) @@ -98,71 +182,174 @@ class SRGSSRIE(InfoExtractor): 'id': media_id, 'title': title, 'description': description, + 'duration': duration, 'timestamp': timestamp, - 'thumbnails': thumbnails, + 'thumbnail': thumbnail, + 'subtitles': subtitles, 'formats': formats, } class SRGSSRPlayIE(InfoExtractor): IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites' - _VALID_URL = r'https?://(?:(?:www|play)\.)?(?Psrf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/[^/]+/(?Pvideo|audio)/[^?]+\?id=(?P[0-9a-f\-]{36}|\d+)' - + _VALID_URL = r'''(?x) + https?:// + (?: + (?: + www| + play + )\. + )? 
+ (?P + srf| + rts| + rsi| + rtr| + swissinfo + )\.ch/play/ + (?: + tv| + radio + )/[^/]+/ + (?P + video| + audio + )/[^?]+\?id= + (?P + [0-9a-f\-]{36}| + \d+ + ) + ''' _TESTS = [{ + # ID in url not the same as media ID, no Save button, no description 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', 'info_dict': { - 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'id': '0d181f8f-07ae-46b2-8d3b-7619c0efb0e3', 'ext': 'mp4', + 'title': '10vor10 vom 01.07.2013', + 'description': None, + 'duration': 1489.921, 'upload_date': '20130701', - 'title': 'Snowden beantragt Asyl in Russland', - 'timestamp': 1372713995, - } - }, { - # No Speichern (Save) button - 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', - 'md5': '0a274ce38fda48c53c01890651985bc6', - 'info_dict': { - 'id': '677f5829-e473-4823-ac83-a1087fe97faa', - 'ext': 'flv', - 'upload_date': '20130710', - 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', - 'description': 'md5:88604432b60d5a38787f152dec89cd56', - 'timestamp': 1373493600, + 'timestamp': 1372708215, + 'thumbnail': r're:^https?://.*1436737120\.png$', }, }, { + # ID in url is the same as video ID, with Save button, german TTML and VTT subtitles (default language) + 'url': 'http://www.srf.ch/play/tv/rundschau/video/schwander-rot-gruene-stadtpolitik-min-li-marti-tamilen-kirche?id=2da578e3-dbb4-4657-a539-f01089a67831', + 'md5': 'b32af364dc9821af183da8dc1433da56', + 'info_dict': { + 'id': '2da578e3-dbb4-4657-a539-f01089a67831', + 'ext': 'mp4', + 'title': 'Schwander, Rot-Grüne Stadtpolitik, Min Li Marti, Tamilen-Kirche', + 'description': 'Verbissener Kampf / Vertreibung der Büezer / Theke: Min Li Marti / Geldsegen für den Pastor', + 'duration': 2630.0, + 'upload_date': '20170208', + 'timestamp': 1486583589, + 'thumbnail': 
r're:^https?://.*1486587225\.png$', + 'subtitles': { + 'de': [{ + 'ext': 'ttml', + 'url': 're:^https://.*\.ttml$', + }, { + 'ext': 'vtt', + 'url': 're:^https://.*\.vtt$', + }] + }, + }, + }, { + # Audio media with RTMP stream 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', 'info_dict': { 'id': '63cb0778-27f8-49af-9284-8c7a8c6d15fc', 'ext': 'mp3', - 'upload_date': '20151013', 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', - 'timestamp': 1444750398, + 'upload_date': '20151013', + 'timestamp': 1444709160, + 'thumbnail': r're:^https?://.*1453369436\.jpg$', }, 'params': { - # rtmp download 'skip_download': True, }, }, { + # Video with french TTML subtitles (default language) 'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260', 'md5': '67a2a9ae4e8e62a68d0e9820cc9782df', 'info_dict': { 'id': '6348260', 'display_id': '6348260', 'ext': 'mp4', - 'duration': 1796, 'title': 'Le 19h30', - 'description': '', - 'uploader': '19h30', 'upload_date': '20141201', 'timestamp': 1417458600, - 'thumbnail': r're:^https?://.*\.image', - 'view_count': int, + 'thumbnail': r're:^https?://.*image', + 'subtitles': { + 'fr': [{ + 'ext': 'ttml', + 'url': 're:^https://.*\.xml$' + }] + }, }, 'params': { - # m3u8 download 'skip_download': True, - } + }, + }, { + # Video with many subtitles in different languages (explicit language definitions) + 'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', + 'info_dict': { + 'id': '42960270', + 'ext': 'mp4', + 'title': 'Why people were against tax reforms', + 'description': 'md5:8c5c1b6a2a37c17670cf87f608ff4755', + 'upload_date': '20170215', + 'timestamp': 1487173560, + 'thumbnail': 'https://www.swissinfo.ch/srgscalableimage/42961964', + 'subtitles': { + 'ar': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'de': [{'ext': 'vtt', 'url': 
're:^https://.*\.vtt$'}], + 'en': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'es': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'fr': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'it': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'ru': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'zh': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + }, + }, + 'params': { + 'skip_download': True, + }, + }, { + # Audio, whole episode + 'url': 'http://www.srf.ch/play/radio/echo-der-zeit/audio/annur-moschee---rachezug-gegen-informanten?id=576a1fca-3cbd-48d7-be2f-e6dfc62a39d2', + 'info_dict': { + 'id': '576a1fca-3cbd-48d7-be2f-e6dfc62a39d2', + 'ext': 'mp3', + 'title': 'Echo der Zeit vom 21.02.2017 18:00:00', + 'description': 'md5:a23d6a67d203083f4b044f88b54020d4', + 'duration': 2419.07, + 'upload_date': '20170221', + 'timestamp': 1487696400, + 'thumbnail': r're:https://.*448775\.170221_echo_annur-winterthur-624\.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Audio story, but not the whole episode + 'url': 'http://www.srf.ch/play/radio/echo-der-zeit/audio/slowenisch-oesterreichischer-nachbarschaftsstreit?id=03f76721-90b8-4d7f-8c14-176e4c4c4308', + 'info_dict': { + 'id': '03f76721-90b8-4d7f-8c14-176e4c4c4308', + 'ext': 'mp3', + 'title': 'Slowenisch-österreichischer Nachbarschaftsstreit', + 'description': 'md5:4f3c5a60e12759afe578c901bbcaa574', + 'duration': 182.387, + 'upload_date': '20170221', + 'timestamp': 1487696400, + 'thumbnail': r're:https://.*448788\.170221_echo_slowenien-miro-cerar-624\.jpg', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url):