1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-01-25 03:53:00 +08:00

[srgssr] Migrate to integrationlayer version 2.0 API

The SRGSSR Play websites now often use the integrationlayer version 2.0
API instead of version 1.0. I have modified the SRGSSR information
extractor to use this new integrationlayer instead of the old one. All
the old media support this new version too, so there is no need to
stick with the old one. It's possible that support for the old
integrationlayer will be dropped, so this switch has to be made anyway.

Here is a list of the changes:
 - Use integrationlayer version 2.0 API instead of version 1.0.
 - Ensure consistent media IDs. In the old version of the information
   extractor, youtube-dl extracts the same video for the URLs
   "http://www.srf.ch/play/tv/schweiz-aktuell/video/schweiz-aktuell-vom-22-02-2017?id=d0206674-6125-49ef-b85d-3cf36d24d582"
   and
   "http://www.srf.ch/play/tv/schweiz-aktuell/video/walliser-baubaubranche-wehrt-sich?id=967590f0-f812-4941-8f6a-06a2db7bd083",
   but uses different media IDs. Now it still extracts the same video
   for both (since there is no support for cutting videos into parts in
   youtube-dl, right?), but it uses the same media ID. So we always have
   consistent media IDs for the media.
 - Add extraction of media duration.
 - Add extraction of video subtitles.
 - Use multiline regular expressions for _VALID_URL for better
   readability.
 - Indicate direct podcast downloads in format_id.
 - Update tests.
This commit is contained in:
Alex Seiler 2017-02-22 20:23:15 +01:00
parent f34b841b51
commit 41a33b2357

View File

@ -7,13 +7,36 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse
from ..utils import (
ExtractorError,
float_or_none,
parse_iso8601,
str_or_none,
qualities,
)
class SRGSSRIE(InfoExtractor):
_VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)'
_VALID_URL = r'''(?x)
(?:
https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|
srgssr
):
(?P<bu>
srf|
rts|
rsi|
rtr|
swi
):(?:[^:]+:)?
(?P<type>
video|
audio
):
(?P<id>
[0-9a-f\-]{36}|
\d+
)
'''
_GEO_BYPASS = False
_GEO_COUNTRIES = ['CH']
@ -36,61 +59,122 @@ class SRGSSRIE(InfoExtractor):
url += '?' + auth_params
return url
def get_media_data(self, bu, media_type, media_id):
def get_media_id_and_data(self, bu, media_type, url_id):
media_data = self._download_json(
'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id),
media_id)[media_type.capitalize()]
'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' % (bu, media_type, url_id), url_id)
media_id = self._search_regex(
r'urn:%s:%s:([0-9a-f\-]{36}|\d+)' % (bu, media_type),
media_data['chapterUrn'],
'media id')
if media_data.get('block') and media_data['block'] in self._ERRORS:
message = self._ERRORS[media_data['block']]
if media_data['block'] == 'GEOBLOCK':
episode_data = media_data.get('episode', {})
chapter_list = media_data.get('chapterList', [])
data = []
if chapter_list:
data.extend([item for item in chapter_list if item.get('id') == media_id])
if not data:
raise ExtractorError('%s said: Cannot extract chapter information.' % self.IE_NAME)
chapter_data = data[0]
block_reason = str_or_none(chapter_data.get('blockReason'))
if block_reason and block_reason in self._ERRORS:
message = self._ERRORS[block_reason]
if block_reason == 'GEOBLOCK':
self.raise_geo_restricted(
msg=message, countries=self._GEO_COUNTRIES)
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, message), expected=True)
elif block_reason:
message = 'This media is not available. Reason: %s' % block_reason
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, message))
return media_id, episode_data, chapter_data
def _get_subtitles(self, bu, media_data):
    """Collect subtitle tracks from a media dict, grouped by language.

    :param bu: business-unit code (e.g. ``'srf'``); used only to pick a
        fallback language when a subtitle entry has no ``locale``.
    :param media_data: dict that may carry a ``subtitleList`` of entries
        with ``format``, ``url`` and optionally ``locale`` keys.
    :return: dict mapping a language code to a list of
        ``{'ext': ..., 'url': ...}`` dicts, TTML entries before VTT ones.
        Empty when there is nothing usable.

    Entries with an unknown format, without a URL, or whose language
    cannot be determined are skipped instead of raising, so incomplete
    subtitle metadata does not abort the whole extraction.
    """
    # Language assumed for each business unit when an entry has no locale.
    default_language_codes = {
        'srf': 'de',
        'rts': 'fr',
        'rsi': 'it',
        'rtr': 'rm',
        'swi': 'en',
    }
    known_formats = ('TTML', 'VTT')
    # Prefer VTT subtitles over TTML: VTT gets the higher rank and therefore
    # sorts last (presumably later entries are the preferred ones -- confirm
    # against the downloader's subtitle selection).
    priorities = {
        'ttml': 1,
        'vtt': 2,
    }

    subtitles = {}
    # `or []` also covers a key that is present but explicitly null.
    for sub in media_data.get('subtitleList') or []:
        form = sub.get('format')
        sub_url = sub.get('url')
        if form not in known_formats or not sub_url:
            continue
        lang = sub.get('locale') or default_language_codes.get(bu)
        if not lang:
            # No explicit locale and no known default for this unit.
            continue
        subtitles.setdefault(lang, []).append({
            'ext': form.lower(),
            'url': sub_url,
        })

    # list.sort is stable, so entries of equal rank keep their source order.
    for tracks in subtitles.values():
        tracks.sort(key=lambda track: priorities[track['ext']])
    return subtitles
return media_data
def _real_extract(self, url):
bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
bu, media_type, url_id = re.match(self._VALID_URL, url).groups()
media_data = self.get_media_data(bu, media_type, media_id)
media_id, episode_data, chapter_data = self.get_media_id_and_data(bu, media_type, url_id)
metadata = media_data['AssetMetadatas']['AssetMetadata'][0]
title = metadata['title']
description = metadata.get('description')
created_date = media_data.get('createdDate') or metadata.get('createdDate')
is_episode = True if chapter_data['position'] == 0 else False
title = episode_data['title'] if is_episode else chapter_data['title']
description = chapter_data.get('description')
duration = float_or_none(chapter_data['duration'], scale=1000)
created_date = chapter_data.get('date')
timestamp = parse_iso8601(created_date)
thumbnails = [{
'id': image.get('id'),
'url': image['url'],
} for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])]
thumbnail = chapter_data.get('imageUrl')
subtitles = self._get_subtitles(bu, chapter_data)
preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD'])
formats = []
for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []):
protocol = source.get('@protocol')
for asset in source['url']:
asset_url = asset['text']
quality = asset['@quality']
format_id = '%s-%s' % (protocol, quality)
if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'):
asset_url = self._get_tokenized_src(asset_url, media_id, format_id)
if protocol.startswith('HTTP-HDS'):
formats.extend(self._extract_f4m_formats(
asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0',
media_id, f4m_id=format_id, fatal=False))
elif protocol.startswith('HTTP-HLS'):
formats.extend(self._extract_m3u8_formats(
asset_url, media_id, 'mp4', 'm3u8_native',
m3u8_id=format_id, fatal=False))
for source in chapter_data.get('resourceList', []):
protocol = str_or_none(source['protocol'])
quality = str_or_none(source['quality'])
encoding = str_or_none(source['encoding'])
format_url = source.get('url')
format_id = '%s-%s-%s' % (protocol, encoding, quality)
if protocol in ('HDS', 'HLS'):
format_url = self._get_tokenized_src(format_url, media_id, format_id)
if protocol == 'HDS':
formats.extend(self._extract_f4m_formats(
format_url + ('?' if '?' not in format_url else '&') + 'hdcore=3.4.0',
media_id, f4m_id=format_id, fatal=False))
else:
formats.extend(self._extract_m3u8_formats(
format_url, media_id, 'mp4', 'm3u8_native',
m3u8_id=format_id, fatal=False))
elif protocol in ('HTTP', 'HTTPS', 'RTMP'):
formats.append({
'format_id': format_id,
'ext': encoding.lower() if encoding else None,
'url': format_url,
'preference': preference(quality)
})
podcast_keys = ('podcastSdUrl', 'podcastHdUrl')
podcast_qualities = ('SD', 'HD')
if chapter_data['position'] == 0:
for key, quality in zip(podcast_keys, podcast_qualities):
if chapter_data.get(key):
formats.append({
'format_id': format_id,
'url': asset_url,
'format_id': 'PODCAST-HTTP-%s' % quality,
'url': chapter_data[key],
'preference': preference(quality),
'ext': 'flv' if protocol == 'RTMP' else None,
})
self._sort_formats(formats)
@ -98,71 +182,174 @@ class SRGSSRIE(InfoExtractor):
'id': media_id,
'title': title,
'description': description,
'duration': duration,
'timestamp': timestamp,
'thumbnails': thumbnails,
'thumbnail': thumbnail,
'subtitles': subtitles,
'formats': formats,
}
class SRGSSRPlayIE(InfoExtractor):
IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites'
_VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/[^/]+/(?P<type>video|audio)/[^?]+\?id=(?P<id>[0-9a-f\-]{36}|\d+)'
_VALID_URL = r'''(?x)
https?://
(?:
(?:
www|
play
)\.
)?
(?P<bu>
srf|
rts|
rsi|
rtr|
swissinfo
)\.ch/play/
(?:
tv|
radio
)/[^/]+/
(?P<type>
video|
audio
)/[^?]+\?id=
(?P<id>
[0-9a-f\-]{36}|
\d+
)
'''
_TESTS = [{
# ID in url not the same as media ID, no Save button, no description
'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
'md5': 'da6b5b3ac9fa4761a942331cef20fcb3',
'info_dict': {
'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5',
'id': '0d181f8f-07ae-46b2-8d3b-7619c0efb0e3',
'ext': 'mp4',
'title': '10vor10 vom 01.07.2013',
'description': None,
'duration': 1489.921,
'upload_date': '20130701',
'title': 'Snowden beantragt Asyl in Russland',
'timestamp': 1372713995,
}
}, {
# No Speichern (Save) button
'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa',
'md5': '0a274ce38fda48c53c01890651985bc6',
'info_dict': {
'id': '677f5829-e473-4823-ac83-a1087fe97faa',
'ext': 'flv',
'upload_date': '20130710',
'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive',
'description': 'md5:88604432b60d5a38787f152dec89cd56',
'timestamp': 1373493600,
'timestamp': 1372708215,
'thumbnail': r're:^https?://.*1436737120\.png$',
},
}, {
# ID in url is the same as video ID, with Save button, german TTML and VTT subtitles (default language)
'url': 'http://www.srf.ch/play/tv/rundschau/video/schwander-rot-gruene-stadtpolitik-min-li-marti-tamilen-kirche?id=2da578e3-dbb4-4657-a539-f01089a67831',
'md5': 'b32af364dc9821af183da8dc1433da56',
'info_dict': {
'id': '2da578e3-dbb4-4657-a539-f01089a67831',
'ext': 'mp4',
'title': 'Schwander, Rot-Grüne Stadtpolitik, Min Li Marti, Tamilen-Kirche',
'description': 'Verbissener Kampf / Vertreibung der Büezer / Theke: Min Li Marti / Geldsegen für den Pastor',
'duration': 2630.0,
'upload_date': '20170208',
'timestamp': 1486583589,
'thumbnail': r're:^https?://.*1486587225\.png$',
'subtitles': {
'de': [{
'ext': 'ttml',
'url': 're:^https://.*\.ttml$',
}, {
'ext': 'vtt',
'url': 're:^https://.*\.vtt$',
}]
},
},
}, {
# Audio media with RTMP stream
'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc',
'info_dict': {
'id': '63cb0778-27f8-49af-9284-8c7a8c6d15fc',
'ext': 'mp3',
'upload_date': '20151013',
'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem',
'timestamp': 1444750398,
'upload_date': '20151013',
'timestamp': 1444709160,
'thumbnail': r're:^https?://.*1453369436\.jpg$',
},
'params': {
# rtmp download
'skip_download': True,
},
}, {
# Video with french TTML subtitles (default language)
'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260',
'md5': '67a2a9ae4e8e62a68d0e9820cc9782df',
'info_dict': {
'id': '6348260',
'display_id': '6348260',
'ext': 'mp4',
'duration': 1796,
'title': 'Le 19h30',
'description': '',
'uploader': '19h30',
'upload_date': '20141201',
'timestamp': 1417458600,
'thumbnail': r're:^https?://.*\.image',
'view_count': int,
'thumbnail': r're:^https?://.*image',
'subtitles': {
'fr': [{
'ext': 'ttml',
'url': 're:^https://.*\.xml$'
}]
},
},
'params': {
# m3u8 download
'skip_download': True,
}
},
}, {
# Video with many subtitles in different languages (explicit language definitions)
'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270',
'info_dict': {
'id': '42960270',
'ext': 'mp4',
'title': 'Why people were against tax reforms',
'description': 'md5:8c5c1b6a2a37c17670cf87f608ff4755',
'upload_date': '20170215',
'timestamp': 1487173560,
'thumbnail': 'https://www.swissinfo.ch/srgscalableimage/42961964',
'subtitles': {
'ar': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}],
'de': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}],
'en': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}],
'es': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}],
'fr': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}],
'it': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}],
'ru': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}],
'zh': [{'ext': 'vtt', 'url': 're:^https://.*\.vtt$'}],
},
},
'params': {
'skip_download': True,
},
}, {
# Audio, whole episode
'url': 'http://www.srf.ch/play/radio/echo-der-zeit/audio/annur-moschee---rachezug-gegen-informanten?id=576a1fca-3cbd-48d7-be2f-e6dfc62a39d2',
'info_dict': {
'id': '576a1fca-3cbd-48d7-be2f-e6dfc62a39d2',
'ext': 'mp3',
'title': 'Echo der Zeit vom 21.02.2017 18:00:00',
'description': 'md5:a23d6a67d203083f4b044f88b54020d4',
'duration': 2419.07,
'upload_date': '20170221',
'timestamp': 1487696400,
'thumbnail': r're:https://.*448775\.170221_echo_annur-winterthur-624\.jpg',
},
'params': {
'skip_download': True,
},
}, {
# Audio story, but not the whole episode
'url': 'http://www.srf.ch/play/radio/echo-der-zeit/audio/slowenisch-oesterreichischer-nachbarschaftsstreit?id=03f76721-90b8-4d7f-8c14-176e4c4c4308',
'info_dict': {
'id': '03f76721-90b8-4d7f-8c14-176e4c4c4308',
'ext': 'mp3',
'title': 'Slowenisch-österreichischer Nachbarschaftsstreit',
'description': 'md5:4f3c5a60e12759afe578c901bbcaa574',
'duration': 182.387,
'upload_date': '20170221',
'timestamp': 1487696400,
'thumbnail': r're:https://.*448788\.170221_echo_slowenien-miro-cerar-624\.jpg',
},
'params': {
'skip_download': True,
},
}]
def _real_extract(self, url):