From 41a33b2357bee0b0a374632582c418fe930de2a1 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Wed, 22 Feb 2017 20:23:15 +0100 Subject: [PATCH 1/3] [srgssr] Migrate to integrationlayer version 2.0 API The SRGSSR Play websites now often uses the integrationlayer version 2.0 API instead of version 1.0. I have modified the SRGSSR information extractor to use this new integrationlayer instead of the old one. All the old media supports this new version too, so there is no need to stick with the old one. It's possible, that the support for the old integrationlayer will be dropped, so this switch has to made anyway. Here is a list of the changes: - Use integrationlayer version 2.0 API instead of version 1.0. - Assure consistant media IDs. In the old version of the information extractor, youtube-dl extracts the same video for the urls "http://www.srf.ch/play/tv/schweiz-aktuell/video/schweiz-aktuell-vom-22-02-2017?id=d0206674-6125-49ef-b85d-3cf36d24d582" and "http://www.srf.ch/play/tv/schweiz-aktuell/video/walliser-baubaubranche-wehrt-sich?id=967590f0-f812-4941-8f6a-06a2db7bd083", but uses different media IDs. Now it still extracts the same videos (since there is no support to cut videos into parts in youtube-dl, right?), but it uses the same media IDs. So we always have consistant media IDs for the media. - Add extraction of media duration. - Add extraction of video subtitles. - Use multiline regular expressions for _VALID_URL for better readability. - Indicate direct podcast downloads in format_id. - Update tests. --- youtube_dl/extractor/srgssr.py | 319 ++++++++++++++++++++++++++------- 1 file changed, 253 insertions(+), 66 deletions(-) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index bb73eb1d5..752611b0b 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -7,13 +7,36 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, + float_or_none, parse_iso8601, + str_or_none, qualities, ) class SRGSSRIE(InfoExtractor): - _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?Psrf|rts|rsi|rtr|swi):(?:[^:]+:)?(?Pvideo|audio):(?P[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'''(?x) + (?: + https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn| + srgssr + ): + (?P + srf| + rts| + rsi| + rtr| + swi + ):(?:[^:]+:)? + (?P + video| + audio + ): + (?P + [0-9a-f\-]{36}| + \d+ + ) + ''' + _GEO_BYPASS = False _GEO_COUNTRIES = ['CH'] @@ -36,61 +59,122 @@ class SRGSSRIE(InfoExtractor): url += '?' + auth_params return url - def get_media_data(self, bu, media_type, media_id): + def get_media_id_and_data(self, bu, media_type, url_id): media_data = self._download_json( - 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), - media_id)[media_type.capitalize()] + 'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' % (bu, media_type, url_id), url_id) + media_id = self._search_regex( + r'urn:%s:%s:([0-9a-f\-]{36}|\d+)' % (bu, media_type), + media_data['chapterUrn'], + 'media id') - if media_data.get('block') and media_data['block'] in self._ERRORS: - message = self._ERRORS[media_data['block']] - if media_data['block'] == 'GEOBLOCK': + episode_data = media_data.get('episode', {}) + + chapter_list = media_data.get('chapterList', []) + data = [] + if chapter_list: + data.extend([item for item in chapter_list if item.get('id') == media_id]) + if not data: + raise ExtractorError('%s said: Cannot extract chapter information.' 
% self.IE_NAME) + + chapter_data = data[0] + block_reason = str_or_none(chapter_data.get('blockReason')) + if block_reason and block_reason in self._ERRORS: + message = self._ERRORS[block_reason] + if block_reason == 'GEOBLOCK': self.raise_geo_restricted( msg=message, countries=self._GEO_COUNTRIES) raise ExtractorError( '%s said: %s' % (self.IE_NAME, message), expected=True) + elif block_reason: + message = 'This media is not available. Reason: %s' % block_reason + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, message)) + + return media_id, episode_data, chapter_data + + def _get_subtitles(self, bu, media_data): + subtitles = {} + subtitle_data = media_data.get('subtitleList', []) + + default_language_codes = { + 'srf': 'de', + 'rts': 'fr', + 'rsi': 'it', + 'rtr': 'rm', + 'swi': 'en', + } + known_formats = ('TTML', 'VTT') + for sub in subtitle_data: + form = sub['format'] + if form not in known_formats: + continue + lang = sub.get('locale') or default_language_codes[bu] + subtitles.setdefault(lang, []).append({ + 'ext': form.lower(), + 'url': sub['url'] + }) + + # Prefer VTT subtitles over TTML: + priorities = { + 'ttml': 1, + 'vtt': 2, + } + for lang in subtitles: + subtitles[lang].sort(key=lambda x: priorities[x['ext']]) + + return subtitles - return media_data def _real_extract(self, url): - bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + bu, media_type, url_id = re.match(self._VALID_URL, url).groups() - media_data = self.get_media_data(bu, media_type, media_id) + media_id, episode_data, chapter_data = self.get_media_id_and_data(bu, media_type, url_id) - metadata = media_data['AssetMetadatas']['AssetMetadata'][0] - title = metadata['title'] - description = metadata.get('description') - created_date = media_data.get('createdDate') or metadata.get('createdDate') + is_episode = True if chapter_data['position'] == 0 else False + title = episode_data['title'] if is_episode else chapter_data['title'] + description = chapter_data.get('description') + duration = float_or_none(chapter_data['duration'], scale=1000) + created_date = chapter_data.get('date') timestamp = parse_iso8601(created_date) - - thumbnails = [{ - 'id': image.get('id'), - 'url': image['url'], - } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])] + thumbnail = chapter_data.get('imageUrl') + subtitles = self._get_subtitles(bu, chapter_data) preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) formats = [] - for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []): - protocol = source.get('@protocol') - for asset in source['url']: - asset_url = asset['text'] - quality = asset['@quality'] - format_id = '%s-%s' % (protocol, quality) - if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): - asset_url = self._get_tokenized_src(asset_url, media_id, format_id) - if protocol.startswith('HTTP-HDS'): - formats.extend(self._extract_f4m_formats( - asset_url + ('?' if '?' 
not in asset_url else '&') + 'hdcore=3.4.0', - media_id, f4m_id=format_id, fatal=False)) - elif protocol.startswith('HTTP-HLS'): - formats.extend(self._extract_m3u8_formats( - asset_url, media_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) + for source in chapter_data.get('resourceList', []): + protocol = str_or_none(source['protocol']) + quality = str_or_none(source['quality']) + encoding = str_or_none(source['encoding']) + format_url = source.get('url') + format_id = '%s-%s-%s' % (protocol, encoding, quality) + + if protocol in ('HDS', 'HLS'): + format_url = self._get_tokenized_src(format_url, media_id, format_id) + if protocol == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url + ('?' if '?' not in format_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) else: + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + elif protocol in ('HTTP', 'HTTPS', 'RTMP'): + formats.append({ + 'format_id': format_id, + 'ext': encoding.lower() if encoding else None, + 'url': format_url, + 'preference': preference(quality) + }) + + podcast_keys = ('podcastSdUrl', 'podcastHdUrl') + podcast_qualities = ('SD', 'HD') + if chapter_data['position'] == 0: + for key, quality in zip(podcast_keys, podcast_qualities): + if chapter_data.get(key): formats.append({ - 'format_id': format_id, - 'url': asset_url, + 'format_id': 'PODCAST-HTTP-%s' % quality, + 'url': chapter_data[key], 'preference': preference(quality), - 'ext': 'flv' if protocol == 'RTMP' else None, }) self._sort_formats(formats) @@ -98,71 +182,174 @@ class SRGSSRIE(InfoExtractor): 'id': media_id, 'title': title, 'description': description, + 'duration': duration, 'timestamp': timestamp, - 'thumbnails': thumbnails, + 'thumbnail': thumbnail, + 'subtitles': subtitles, 'formats': formats, } class SRGSSRPlayIE(InfoExtractor): IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites' - _VALID_URL = r'https?://(?:(?:www|play)\.)?(?Psrf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/[^/]+/(?Pvideo|audio)/[^?]+\?id=(?P[0-9a-f\-]{36}|\d+)' - + _VALID_URL = r'''(?x) + https?:// + (?: + (?: + www| + play + )\. + )? 
+ (?P + srf| + rts| + rsi| + rtr| + swissinfo + )\.ch/play/ + (?: + tv| + radio + )/[^/]+/ + (?P + video| + audio + )/[^?]+\?id= + (?P + [0-9a-f\-]{36}| + \d+ + ) + ''' _TESTS = [{ + # ID in url not the same as media ID, no Save button, no description 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', 'info_dict': { - 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'id': '0d181f8f-07ae-46b2-8d3b-7619c0efb0e3', 'ext': 'mp4', + 'title': '10vor10 vom 01.07.2013', + 'description': None, + 'duration': 1489.921, 'upload_date': '20130701', - 'title': 'Snowden beantragt Asyl in Russland', - 'timestamp': 1372713995, - } - }, { - # No Speichern (Save) button - 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', - 'md5': '0a274ce38fda48c53c01890651985bc6', - 'info_dict': { - 'id': '677f5829-e473-4823-ac83-a1087fe97faa', - 'ext': 'flv', - 'upload_date': '20130710', - 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', - 'description': 'md5:88604432b60d5a38787f152dec89cd56', - 'timestamp': 1373493600, + 'timestamp': 1372708215, + 'thumbnail': r're:^https?://.*1436737120\.png$', }, }, { + # ID in url is the same as video ID, with Save button, german TTML and VTT subtitles (default language) + 'url': 'http://www.srf.ch/play/tv/rundschau/video/schwander-rot-gruene-stadtpolitik-min-li-marti-tamilen-kirche?id=2da578e3-dbb4-4657-a539-f01089a67831', + 'md5': 'b32af364dc9821af183da8dc1433da56', + 'info_dict': { + 'id': '2da578e3-dbb4-4657-a539-f01089a67831', + 'ext': 'mp4', + 'title': 'Schwander, Rot-Grüne Stadtpolitik, Min Li Marti, Tamilen-Kirche', + 'description': 'Verbissener Kampf / Vertreibung der Büezer / Theke: Min Li Marti / Geldsegen für den Pastor', + 'duration': 2630.0, + 'upload_date': '20170208', + 'timestamp': 1486583589, + 'thumbnail': r're:^https?://.*1486587225\.png$', + 'subtitles': { + 'de': [{ + 'ext': 'ttml', + 'url': 're:^https://.*\.ttml$', + }, { + 'ext': 'vtt', + 'url': 're:^https://.*\.vtt$', + }] + }, + }, + }, { + # Audio media with RTMP stream 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', 'info_dict': { 'id': '63cb0778-27f8-49af-9284-8c7a8c6d15fc', 'ext': 'mp3', - 'upload_date': '20151013', 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', - 'timestamp': 1444750398, + 'upload_date': '20151013', + 'timestamp': 1444709160, + 'thumbnail': r're:^https?://.*1453369436\.jpg$', }, 'params': { - # rtmp download 'skip_download': True, }, }, { + # Video with french TTML subtitles (default language) 'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260', 'md5': '67a2a9ae4e8e62a68d0e9820cc9782df', 'info_dict': { 'id': '6348260', 'display_id': '6348260', 'ext': 'mp4', - 'duration': 1796, 'title': 'Le 19h30', - 'description': '', - 'uploader': '19h30', 'upload_date': '20141201', 'timestamp': 1417458600, - 'thumbnail': r're:^https?://.*\.image', - 'view_count': int, + 'thumbnail': r're:^https?://.*image', + 'subtitles': { + 'fr': [{ + 'ext': 'ttml', + 'url': 're:^https://.*\.xml$' + }] + }, }, 'params': { - # m3u8 download 'skip_download': True, - } + }, + }, { + # Video with many subtitles in different languages (explicit language definitions) + 'url': 
'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', + 'info_dict': { + 'id': '42960270', + 'ext': 'mp4', + 'title': 'Why people were against tax reforms', + 'description': 'md5:8c5c1b6a2a37c17670cf87f608ff4755', + 'upload_date': '20170215', + 'timestamp': 1487173560, + 'thumbnail': 'https://www.swissinfo.ch/srgscalableimage/42961964', + 'subtitles': { + 'ar': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'de': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'en': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'es': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'fr': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'it': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'ru': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + 'zh': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}], + }, + }, + 'params': { + 'skip_download': True, + }, + }, { + # Audio, whole episode + 'url': 'http://www.srf.ch/play/radio/echo-der-zeit/audio/annur-moschee---rachezug-gegen-informanten?id=576a1fca-3cbd-48d7-be2f-e6dfc62a39d2', + 'info_dict': { + 'id': '576a1fca-3cbd-48d7-be2f-e6dfc62a39d2', + 'ext': 'mp3', + 'title': 'Echo der Zeit vom 21.02.2017 18:00:00', + 'description': 'md5:a23d6a67d203083f4b044f88b54020d4', + 'duration': 2419.07, + 'upload_date': '20170221', + 'timestamp': 1487696400, + 'thumbnail': r're:https://.*448775\.170221_echo_annur-winterthur-624\.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Audio story, but not the whole episode + 'url': 'http://www.srf.ch/play/radio/echo-der-zeit/audio/slowenisch-oesterreichischer-nachbarschaftsstreit?id=03f76721-90b8-4d7f-8c14-176e4c4c4308', + 'info_dict': { + 'id': '03f76721-90b8-4d7f-8c14-176e4c4c4308', + 'ext': 'mp3', + 'title': 'Slowenisch-österreichischer Nachbarschaftsstreit', + 'description': 'md5:4f3c5a60e12759afe578c901bbcaa574', + 'duration': 182.387, + 'upload_date': '20170221', + 'timestamp': 1487696400, + 'thumbnail': r're:https://.*448788\.170221_echo_slowenien-miro-cerar-624\.jpg', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): From 9f4f1b56dfe79bc0b5004680ca09647918d284d3 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Wed, 22 Feb 2017 20:52:13 +0100 Subject: [PATCH 2/3] [srgssr] PEP8 --- youtube_dl/extractor/srgssr.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index 752611b0b..a2fc773b9 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -124,7 +124,6 @@ class SRGSSRIE(InfoExtractor): return subtitles - def _real_extract(self, url): bu, media_type, url_id = re.match(self._VALID_URL, url).groups() @@ -243,18 +242,18 @@ class SRGSSRPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'Schwander, Rot-Grüne Stadtpolitik, Min Li Marti, Tamilen-Kirche', 'description': 'Verbissener Kampf / Vertreibung der Büezer / Theke: Min Li Marti / Geldsegen für den Pastor', - 'duration': 2630.0, + 'duration': 2630.0, 'upload_date': '20170208', 'timestamp': 1486583589, 'thumbnail': r're:^https?://.*1486587225\.png$', 'subtitles': { 'de': [{ - 'ext': 'ttml', - 'url': 're:^https://.*\.ttml$', - }, { - 'ext': 'vtt', - 'url': 're:^https://.*\.vtt$', - }] + 'ext': 'ttml', + 'url': 're:^https://.*\.ttml$', + }, { + 'ext': 'vtt', + 'url': 're:^https://.*\.vtt$', + }] }, }, }, { From 7202c54fd0f258bdcbc74558dad3aac4e798ef60 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Fri, 24 Feb 2017 14:59:06 +0100 Subject: [PATCH 3/3] 
[srgssr] Add support for segmented videos I have fixed the problem of "different IDs for the same content". List of changes: - Revert to the old behavior of extracting media IDs. - Support segmented videos (extract only the relevant parts of the whole episode). - Reduce verbosity of _VALID_URL. --- youtube_dl/extractor/srgssr.py | 220 +++++++++++++++++---------------- 1 file changed, 115 insertions(+), 105 deletions(-) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index a2fc773b9..bec3730da 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -8,6 +8,7 @@ from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, float_or_none, + mimetype2ext, parse_iso8601, str_or_none, qualities, @@ -21,19 +22,13 @@ class SRGSSRIE(InfoExtractor): srgssr ): (?P - srf| - rts| - rsi| - rtr| - swi + srf|rts|rsi|rtr|swi ):(?:[^:]+:)? (?P - video| - audio + video|audio ): (?P - [0-9a-f\-]{36}| - \d+ + [0-9a-f\-]{36}|\d+ ) ''' @@ -49,35 +44,34 @@ class SRGSSRIE(InfoExtractor): 'STARTDATE': 'This video is not yet available. Please try again later.', } - def _get_tokenized_src(self, url, video_id, format_id): + def _get_tokenized_src(self, url, video_id, format_id, segment_data): sp = compat_urllib_parse_urlparse(url).path.split('/') token = self._download_json( 'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]), video_id, 'Downloading %s token' % format_id, fatal=False) or {} auth_params = token.get('token', {}).get('authparams') + if segment_data: + timestep_string = self._get_timestep_token(segment_data) + url += ('?' if '?' not in url else '&') + timestep_string if auth_params: - url += '?' + auth_params + url += ('?' if '?' not in url else '&') + auth_params return url - def get_media_id_and_data(self, bu, media_type, url_id): - media_data = self._download_json( - 'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' % (bu, media_type, url_id), url_id) - media_id = self._search_regex( - r'urn:%s:%s:([0-9a-f\-]{36}|\d+)' % (bu, media_type), - media_data['chapterUrn'], - 'media id') + def _get_timestep_token(self, segment_data): + start = str_or_none(float_or_none(segment_data['markIn'], scale=1000)) + end = str_or_none(float_or_none(segment_data['markOut'], scale=1000)) + return 'start=%s&end=%s' % (start, end) - episode_data = media_data.get('episode', {}) + def _extract_list_item(self, outer_data, key, item_id): + data_list = outer_data.get(key, []) + items = [] + if data_list: + items.extend([item for item in data_list if item.get('id') == item_id]) + if not items: + raise ExtractorError('%s said: Cannot extract %s' % (self.IE_NAME, key)) - chapter_list = media_data.get('chapterList', []) - data = [] - if chapter_list: - data.extend([item for item in chapter_list if item.get('id') == media_id]) - if not data: - raise ExtractorError('%s said: Cannot extract chapter information.' % self.IE_NAME) - - chapter_data = data[0] - block_reason = str_or_none(chapter_data.get('blockReason')) + item = items[0] + block_reason = str_or_none(item.get('blockReason')) if block_reason and block_reason in self._ERRORS: message = self._ERRORS[block_reason] if block_reason == 'GEOBLOCK': @@ -86,15 +80,31 @@ class SRGSSRIE(InfoExtractor): raise ExtractorError( '%s said: %s' % (self.IE_NAME, message), expected=True) elif block_reason: - message = 'This media is not available. Reason: %s' % block_reason raise ExtractorError( - '%s said: %s' % (self.IE_NAME, message)) + '%s said: This media is not available. 
Reason %s' % (self.IE_NAME, block_reason)) + return item - return media_id, episode_data, chapter_data + def _get_ids_and_data(self, bu, media_type, url_id): + media_data = self._download_json( + 'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' % (bu, media_type, url_id), url_id) + urn_regex = r'urn:%s:%s:([0-9a-f\-]{36}|\d+)' % (bu, media_type) + chapter_id = self._search_regex( + urn_regex, media_data['chapterUrn'], 'chapter id') + segment_urn = str_or_none(media_data.get('segmentUrn')) + segment_id = self._search_regex( + urn_regex, + media_data['segmentUrn'], + 'segment id') if segment_urn else None + return chapter_id, segment_id, media_data - def _get_subtitles(self, bu, media_data): + def _get_subtitles(self, bu, media_type, chapter_data, segment_data): subtitles = {} - subtitle_data = media_data.get('subtitleList', []) + if media_type == 'audio': + return subtitles + + subtitle_data = segment_data.get( + 'subtitleList', []) if segment_data else chapter_data.get( + 'subtitleList', []) default_language_codes = { 'srf': 'de', @@ -124,19 +134,45 @@ class SRGSSRIE(InfoExtractor): return subtitles + def _get_thumbnail(self, chapter_data, segment_data): + if segment_data: + return segment_data.get('imageUrl') + else: + return chapter_data.get('imageUrl') + def _real_extract(self, url): bu, media_type, url_id = re.match(self._VALID_URL, url).groups() - media_id, episode_data, chapter_data = self.get_media_id_and_data(bu, media_type, url_id) + chapter_id, segment_id, media_data = self._get_ids_and_data( + bu, media_type, url_id) + media_id = segment_id or chapter_id + episode_data = media_data.get('episode', {}) + chapter_data = self._extract_list_item( + media_data, 'chapterList', chapter_id) + segment_data = self._extract_list_item( + chapter_data, 'segmentList', segment_id) if segment_id else None - is_episode = True if chapter_data['position'] == 0 else False - title = episode_data['title'] if is_episode else chapter_data['title'] - description = chapter_data.get('description') - duration = float_or_none(chapter_data['duration'], scale=1000) - created_date = chapter_data.get('date') + is_whole_episode = True if chapter_data['position'] == 0 and not segment_id else False + if media_type == 'video': + title = chapter_data['title'] if is_whole_episode else segment_data['title'] + description = chapter_data.get( + 'description') if is_whole_episode else segment_data.get('description') + else: + # Audio media title and description set in chapter_data only refer to + # the content of the first chapter, so we take these informations from + # episode_data in case of a multi-chapter audio media. 
+ title = episode_data['title'] if is_whole_episode else chapter_data['title'] + description = episode_data.get( + 'description') if is_whole_episode else chapter_data.get('description') + duration = float_or_none( + segment_data['duration'], scale=1000) if segment_id else float_or_none( + chapter_data['duration'], scale=1000) + created_date = segment_data.get( + 'date') if segment_id else chapter_data.get('date') timestamp = parse_iso8601(created_date) - thumbnail = chapter_data.get('imageUrl') - subtitles = self._get_subtitles(bu, chapter_data) + thumbnail = self._get_thumbnail(chapter_data, segment_data) + subtitles = self._get_subtitles( + bu, media_type, chapter_data, segment_data) preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) formats = [] @@ -144,11 +180,13 @@ class SRGSSRIE(InfoExtractor): protocol = str_or_none(source['protocol']) quality = str_or_none(source['quality']) encoding = str_or_none(source['encoding']) + mime_type = str_or_none(source.get('mimeType')) format_url = source.get('url') format_id = '%s-%s-%s' % (protocol, encoding, quality) if protocol in ('HDS', 'HLS'): - format_url = self._get_tokenized_src(format_url, media_id, format_id) + format_url = self._get_tokenized_src( + format_url, media_id, format_id, segment_data) if protocol == 'HDS': formats.extend(self._extract_f4m_formats( format_url + ('?' if '?' not in format_url else '&') + 'hdcore=3.4.0', @@ -157,17 +195,17 @@ class SRGSSRIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) - elif protocol in ('HTTP', 'HTTPS', 'RTMP'): + elif not segment_id and protocol in ('HTTP', 'HTTPS', 'RTMP'): formats.append({ 'format_id': format_id, - 'ext': encoding.lower() if encoding else None, + 'ext': mimetype2ext(mime_type) if mime_type else None, 'url': format_url, 'preference': preference(quality) }) podcast_keys = ('podcastSdUrl', 'podcastHdUrl') podcast_qualities = ('SD', 'HD') - if chapter_data['position'] == 0: + if is_whole_episode: for key, quality in zip(podcast_keys, podcast_qualities): if chapter_data.get(key): formats.append({ @@ -195,46 +233,38 @@ class SRGSSRPlayIE(InfoExtractor): https?:// (?: (?: - www| - play + www|play )\. )? 
(?P - srf| - rts| - rsi| - rtr| - swissinfo + srf|rts|rsi|rtr|swissinfo )\.ch/play/ (?: - tv| - radio + tv|radio )/[^/]+/ (?P - video| - audio + video|audio )/[^?]+\?id= (?P - [0-9a-f\-]{36}| - \d+ + [0-9a-f\-]{36}|\d+ ) ''' _TESTS = [{ - # ID in url not the same as media ID, no Save button, no description + # No save button, no description, only segment of a episode 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', + 'md5': '37040a6e7caa7bd25e9aad2f2f05e449', 'info_dict': { - 'id': '0d181f8f-07ae-46b2-8d3b-7619c0efb0e3', + 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'ext': 'mp4', - 'title': '10vor10 vom 01.07.2013', + 'title': 'Snowden beantragt Asyl in Russland', 'description': None, - 'duration': 1489.921, + 'duration': 113.827, 'upload_date': '20130701', 'timestamp': 1372708215, - 'thumbnail': r're:^https?://.*1436737120\.png$', + 'thumbnail': r're:^https?://.*1383719781\.png$', }, }, { - # ID in url is the same as video ID, with Save button, german TTML and VTT subtitles (default language) + # With Save button, whole episode, german TTML and VTT subtitles (default language) 'url': 'http://www.srf.ch/play/tv/rundschau/video/schwander-rot-gruene-stadtpolitik-min-li-marti-tamilen-kirche?id=2da578e3-dbb4-4657-a539-f01089a67831', 'md5': 'b32af364dc9821af183da8dc1433da56', 'info_dict': { @@ -256,42 +286,6 @@ class SRGSSRPlayIE(InfoExtractor): }] }, }, - }, { - # Audio media with RTMP stream - 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', - 'info_dict': { - 'id': '63cb0778-27f8-49af-9284-8c7a8c6d15fc', - 'ext': 'mp3', - 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', - 'upload_date': '20151013', - 'timestamp': 1444709160, - 'thumbnail': r're:^https?://.*1453369436\.jpg$', - }, - 'params': { - 'skip_download': True, - }, - }, { - # Video with french TTML subtitles (default language) - 'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260', - 'md5': '67a2a9ae4e8e62a68d0e9820cc9782df', - 'info_dict': { - 'id': '6348260', - 'display_id': '6348260', - 'ext': 'mp4', - 'title': 'Le 19h30', - 'upload_date': '20141201', - 'timestamp': 1417458600, - 'thumbnail': r're:^https?://.*image', - 'subtitles': { - 'fr': [{ - 'ext': 'ttml', - 'url': 're:^https://.*\.xml$' - }] - }, - }, - 'params': { - 'skip_download': True, - }, }, { # Video with many subtitles in different languages (explicit language definitions) 'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', @@ -317,14 +311,30 @@ class SRGSSRPlayIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { - # Audio, whole episode + # Audio media with RTMP stream + 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', + 'info_dict': { + 'id': '63cb0778-27f8-49af-9284-8c7a8c6d15fc', + 'ext': 'mp3', + 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', + 'upload_date': '20151013', + 'timestamp': 1444709160, + 'thumbnail': r're:^https?://.*1453369436\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + }, { + + # Audio, whole episode of a show (i.e. 
chapter position 0) 'url': 'http://www.srf.ch/play/radio/echo-der-zeit/audio/annur-moschee---rachezug-gegen-informanten?id=576a1fca-3cbd-48d7-be2f-e6dfc62a39d2', 'info_dict': { 'id': '576a1fca-3cbd-48d7-be2f-e6dfc62a39d2', 'ext': 'mp3', 'title': 'Echo der Zeit vom 21.02.2017 18:00:00', - 'description': 'md5:a23d6a67d203083f4b044f88b54020d4', + 'description': None, 'duration': 2419.07, 'upload_date': '20170221', 'timestamp': 1487696400, @@ -334,7 +344,7 @@ class SRGSSRPlayIE(InfoExtractor): 'skip_download': True, }, }, { - # Audio story, but not the whole episode + # Audio story of the show in the previous test, but not the whole episode 'url': 'http://www.srf.ch/play/radio/echo-der-zeit/audio/slowenisch-oesterreichischer-nachbarschaftsstreit?id=03f76721-90b8-4d7f-8c14-176e4c4c4308', 'info_dict': { 'id': '03f76721-90b8-4d7f-8c14-176e4c4c4308', @@ -344,7 +354,7 @@ class SRGSSRPlayIE(InfoExtractor): 'duration': 182.387, 'upload_date': '20170221', 'timestamp': 1487696400, - 'thumbnail': r're:https://.*448788\.170221_echo_slowenien-miro-cerar-624\.jpg', + 'thumbnail': r're:^https://.*448788\.170221_echo_slowenien-miro-cerar-624\.jpg$', }, 'params': { 'skip_download': True,
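
For reference, the lookup flow these patches implement can be exercised outside youtube-dl roughly as follows: fetch the integrationlayer 2.0 mediaComposition document, resolve the chapter (and, for segmented videos, the segment) by ID, and turn the segment's markIn/markOut milliseconds into the start/end query string that is appended to the tokenized stream URL. The endpoint and the chapterUrn/segmentUrn/chapterList/segmentList/markIn/markOut field names are the ones used in the diffs above; the requests-based helpers, their names and the example media ID are only an illustrative sketch under those assumptions, not part of the extractor itself (which goes through _download_json and the Akamai token handling shown in the patch).

    # Illustrative sketch only: mirrors the mediaComposition lookup and the
    # segment timestep logic from the patches above, outside of youtube-dl.
    import requests

    def fetch_media_composition(bu, media_type, media_id):
        # e.g. bu='srf', media_type='video', media_id='28e1a57d-5b76-4399-8ab3-9097f071e6c5'
        url = ('https://il.srgssr.ch/integrationlayer/2.0/'
               '%s/mediaComposition/%s/%s.json' % (bu, media_type, media_id))
        return requests.get(url, timeout=10).json()

    def pick_item(items, item_id):
        # Mirrors _extract_list_item: find the chapter/segment whose 'id' matches.
        for item in items or []:
            if item.get('id') == item_id:
                return item
        return None

    def timestep_params(segment):
        # Mirrors _get_timestep_token: markIn/markOut are given in milliseconds,
        # while the stream URL expects start/end in seconds.
        start = segment['markIn'] / 1000.0
        end = segment['markOut'] / 1000.0
        return 'start=%s&end=%s' % (start, end)

    if __name__ == '__main__':
        data = fetch_media_composition(
            'srf', 'video', '28e1a57d-5b76-4399-8ab3-9097f071e6c5')
        chapter_id = data['chapterUrn'].split(':')[-1]
        segment_id = data.get('segmentUrn', '').split(':')[-1] or None
        chapter = pick_item(data.get('chapterList'), chapter_id)
        segment = (pick_item(chapter.get('segmentList'), segment_id)
                   if segment_id and chapter else None)
        if segment:
            print('append to tokenized stream URL:', timestep_params(segment))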