From 0b16b3c2d35d1706ec5c55e5b06352c753127368 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 09:22:24 +0100 Subject: [PATCH 01/14] [twitch] add support for Clip embed URLs --- youtube_dl/extractor/twitch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index ca7676fe2..a5681409c 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -644,7 +644,7 @@ class TwitchStreamIE(TwitchBaseIE): class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' - _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:[^/]+/)*|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', @@ -667,6 +667,9 @@ class TwitchClipsIE(TwitchBaseIE): }, { 'url': 'https://www.twitch.tv/sergeynixon/clip/StormyThankfulSproutFutureMan', 'only_matching': True, + }, { + 'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited', + 'only_matching': True, }] def _real_extract(self, url): From 18ca61c5e153d1c1cb8b9a2de3c8b9dfdaa69b0e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 09:23:20 +0100 Subject: [PATCH 02/14] [twitter] improve extraction - add support for generic embeds(closes #22168) - always extract http formats for native videos(closes #14934) - add support for Twitter Broadcasts(closes #21369) - extract more metadata - improve VMap format extraction - unify extraction code for both twitter statuses and cards --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/periscope.py | 80 ++-- youtube_dl/extractor/twitter.py | 570 +++++++++++++++-------------- 3 files changed, 344 insertions(+), 307 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2f9ba6893..598006061 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1241,6 +1241,7 @@ from .twitter import ( TwitterCardIE, TwitterIE, TwitterAmplifyIE, + TwitterBroadcastIE, ) from .udemy import ( UdemyIE, diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index b337a56c0..c02e34aba 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -17,12 +17,54 @@ class PeriscopeBaseIE(InfoExtractor): 'https://api.periscope.tv/api/v2/%s' % method, item_id, query=query) + def _parse_broadcast_data(self, broadcast, video_id): + title = broadcast['status'] + uploader = broadcast.get('user_display_name') or broadcast.get('username') + title = '%s - %s' % (uploader, title) if uploader else title + is_live = broadcast.get('state').lower() == 'running' + + thumbnails = [{ + 'url': broadcast[image], + } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + + return { + 'id': broadcast.get('id') or video_id, + 'title': self._live_title(title) if is_live else title, + 'timestamp': parse_iso8601(broadcast.get('created_at')), + 'uploader': uploader, + 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), + 'thumbnails': thumbnails, + 'view_count': int_or_none(broadcast.get('total_watched')), + 'tags': broadcast.get('tags'), + 'is_live': is_live, + } + + @staticmethod + def _extract_common_format_info(broadcast): + return broadcast.get('state').lower(), int_or_none(broadcast.get('width')), int_or_none(broadcast.get('height')) + + @staticmethod + def _add_width_and_height(f, width, height): + for key, val in (('width', width), ('height', height)): + if not f.get(key): + f[key] = val + + def _extract_pscp_m3u8_formats(self, m3u8_url, video_id, format_id, state, width, height, fatal=True): + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native' + if state in ('ended', 'timed_out') else 'm3u8', + m3u8_id=format_id, fatal=fatal) + if len(m3u8_formats) == 1: + self._add_width_and_height(m3u8_formats[0], width, height) + return m3u8_formats + class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P[^/?#]+)' - # Alive example URLs can be found here http://onperiscope.com/ + # Alive example URLs can be found here https://www.periscope.tv/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', 'md5': '65b57957972e503fcbbaeed8f4fa04ca', @@ -61,21 +103,9 @@ class PeriscopeIE(PeriscopeBaseIE): 'accessVideoPublic', {'broadcast_id': token}, token) broadcast = stream['broadcast'] - title = broadcast['status'] + info = self._parse_broadcast_data(broadcast, token) - uploader = broadcast.get('user_display_name') or broadcast.get('username') - uploader_id = (broadcast.get('user_id') or broadcast.get('username')) - - title = '%s - %s' % (uploader, title) if uploader else title state = broadcast.get('state').lower() - if state == 'running': - title = self._live_title(title) - timestamp = parse_iso8601(broadcast.get('created_at')) - - thumbnails = [{ - 'url': broadcast[image], - } for image in ('image_url', 'image_url_small') if broadcast.get(image)] - width = int_or_none(broadcast.get('width')) height = int_or_none(broadcast.get('height')) @@ -92,32 +122,20 @@ class PeriscopeIE(PeriscopeBaseIE): continue video_urls.add(video_url) if format_id != 'rtmp': - m3u8_formats = self._extract_m3u8_formats( - video_url, token, 'mp4', - entry_protocol='m3u8_native' - if state in ('ended', 'timed_out') else 'm3u8', - m3u8_id=format_id, fatal=False) - if len(m3u8_formats) == 1: - add_width_and_height(m3u8_formats[0]) + m3u8_formats = self._extract_pscp_m3u8_formats( + video_url, token, format_id, state, width, height, False) formats.extend(m3u8_formats) continue rtmp_format = { 'url': video_url, 'ext': 'flv' if format_id == 'rtmp' else 'mp4', } - add_width_and_height(rtmp_format) + self._add_width_and_height(rtmp_format) formats.append(rtmp_format) self._sort_formats(formats) - return { - 'id': broadcast.get('id') or token, - 'title': title, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'thumbnails': thumbnails, - 'formats': formats, - } + info['formats'] = formats + return info class PeriscopeUserIE(PeriscopeBaseIE): diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index cebb6238c..5f8d90fb4 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -4,32 +4,67 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_HTTPError, + compat_parse_qs, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, +) from ..utils import ( - determine_ext, dict_get, ExtractorError, float_or_none, int_or_none, - remove_end, try_get, + strip_or_none, + unified_timestamp, + update_url_query, xpath_text, ) -from .periscope import PeriscopeIE +from .periscope import ( + PeriscopeBaseIE, + PeriscopeIE, +) class TwitterBaseIE(InfoExtractor): + _API_BASE = 'https://api.twitter.com/1.1/' + _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?twitter\.com/' + _GUEST_TOKEN = None + + def _extract_variant_formats(self, variant, video_id): + variant_url = variant.get('url') + if not variant_url: + return [] + elif '.m3u8' in variant_url: + return self._extract_m3u8_formats( + variant_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + else: + tbr = int_or_none(dict_get(variant, ('bitrate', 'bit_rate')), 1000) or None + f = { + 'url': variant_url, + 'format_id': 'http' + ('-%d' % tbr if tbr else ''), + 'tbr': tbr, + } + self._search_dimensions_in_video_url(f, variant_url) + return [f] + def _extract_formats_from_vmap_url(self, vmap_url, video_id): vmap_data = self._download_xml(vmap_url, video_id) - video_url = xpath_text(vmap_data, './/MediaFile').strip() - if determine_ext(video_url) == 'm3u8': - return self._extract_m3u8_formats( - video_url, video_id, ext='mp4', m3u8_id='hls', - entry_protocol='m3u8_native') - return [{ - 'url': video_url, - }] + formats = [] + urls = [] + for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'): + video_variant.attrib['url'] = compat_urllib_parse_unquote( + video_variant.attrib['url']) + urls.append(video_variant.attrib['url']) + formats.extend(self._extract_variant_formats( + video_variant.attrib, video_id)) + video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile')) + if video_url not in urls: + formats.extend(self._extract_variant_formats({'url': video_url}, video_id)) + return formats @staticmethod def _search_dimensions_in_video_url(a_format, video_url): @@ -40,10 +75,30 @@ class TwitterBaseIE(InfoExtractor): 'height': int(m.group('height')), }) + def _call_api(self, path, video_id, query={}): + headers = { + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', + } + if not self._GUEST_TOKEN: + self._GUEST_TOKEN = self._download_json( + self._API_BASE + 'guest/activate.json', video_id, + 'Downloading guest token', data=b'', + headers=headers)['guest_token'] + headers['x-guest-token'] = self._GUEST_TOKEN + try: + return self._download_json( + self._API_BASE + path, video_id, headers=headers, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), + video_id)['errors'][0]['message'], expected=True) + raise -class TwitterCardIE(TwitterBaseIE): + +class TwitterCardIE(InfoExtractor): IE_NAME = 'twitter:card' - _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?Pcards/tfw/v1|videos(?:/tweet)?)/(?P\d+)' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P\d+)' _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', @@ -51,19 +106,28 @@ class TwitterCardIE(TwitterBaseIE): 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', - 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': "Twitter - You can now shoot, edit and share video on Twitter. Capture life's most moving moments from your perspective.", + 'description': 'md5:18d3e24bb4f6e5007487dd546e53bd96', + 'uploader': 'Twitter', + 'uploader_id': 'Twitter', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 30.033, + 'timestamp': 1422366112, + 'upload_date': '20150127', }, }, { 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', - 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8', + 'md5': '7137eca597f72b9abbe61e5ae0161399', 'info_dict': { 'id': '623160978427936768', 'ext': 'mp4', - 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*$', + 'title': "NASA - Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video.", + 'description': "Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video. https://t.co/BJYgOjSeGA", + 'uploader': 'NASA', + 'uploader_id': 'NASA', + 'timestamp': 1437408129, + 'upload_date': '20150720', }, }, { @@ -75,7 +139,7 @@ class TwitterCardIE(TwitterBaseIE): 'title': 'Ubuntu 11.10 Overview', 'description': 'md5:a831e97fa384863d6e26ce48d1c43376', 'upload_date': '20111013', - 'uploader': 'OMG! Ubuntu!', + 'uploader': 'OMG! UBUNTU!', 'uploader_id': 'omgubuntu', }, 'add_ie': ['Youtube'], @@ -99,190 +163,30 @@ class TwitterCardIE(TwitterBaseIE): 'info_dict': { 'id': '705235433198714880', 'ext': 'mp4', - 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*', + 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.", + 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. https://t.co/OrxcJ28Bns", + 'uploader': 'Brent Yarina', + 'uploader_id': 'BTNBrentYarina', + 'timestamp': 1456976204, + 'upload_date': '20160303', }, + 'skip': 'This content is no longer available.', }, { 'url': 'https://twitter.com/i/videos/752274308186120192', 'only_matching': True, }, ] - _API_BASE = 'https://api.twitter.com/1.1' - - def _parse_media_info(self, media_info, video_id): - formats = [] - for media_variant in media_info.get('variants', []): - media_url = media_variant['url'] - if media_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) - elif media_url.endswith('.mpd'): - formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) - else: - tbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000) - a_format = { - 'url': media_url, - 'format_id': 'http-%d' % tbr if tbr else 'http', - 'tbr': tbr, - } - # Reported bitRate may be zero - if not a_format['tbr']: - del a_format['tbr'] - - self._search_dimensions_in_video_url(a_format, media_url) - - formats.append(a_format) - return formats - - def _extract_mobile_formats(self, username, video_id): - webpage = self._download_webpage( - 'https://mobile.twitter.com/%s/status/%s' % (username, video_id), - video_id, 'Downloading mobile webpage', - headers={ - # A recent mobile UA is necessary for `gt` cookie - 'User-Agent': 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0', - }) - main_script_url = self._html_search_regex( - r']+src="([^"]+main\.[^"]+)"', webpage, 'main script URL') - main_script = self._download_webpage( - main_script_url, video_id, 'Downloading main script') - bearer_token = self._search_regex( - r'BEARER_TOKEN\s*:\s*"([^"]+)"', - main_script, 'bearer token') - # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id - api_data = self._download_json( - '%s/statuses/show/%s.json' % (self._API_BASE, video_id), - video_id, 'Downloading API data', - headers={ - 'Authorization': 'Bearer ' + bearer_token, - }) - media_info = try_get(api_data, lambda o: o['extended_entities']['media'][0]['video_info']) or {} - return self._parse_media_info(media_info, video_id) - def _real_extract(self, url): - path, video_id = re.search(self._VALID_URL, url).groups() - - config = None - formats = [] - duration = None - - urls = [url] - if path.startswith('cards/'): - urls.append('https://twitter.com/i/videos/' + video_id) - - for u in urls: - webpage = self._download_webpage( - u, video_id, headers={'Referer': 'https://twitter.com/'}) - - iframe_url = self._html_search_regex( - r']+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', - webpage, 'video iframe', default=None) - if iframe_url: - return self.url_result(iframe_url) - - config = self._parse_json(self._html_search_regex( - r'data-(?:player-)?config="([^"]+)"', webpage, - 'data player config', default='{}'), - video_id) - - if config.get('source_type') == 'vine': - return self.url_result(config['player_url'], 'Vine') - - periscope_url = PeriscopeIE._extract_url(webpage) - if periscope_url: - return self.url_result(periscope_url, PeriscopeIE.ie_key()) - - video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') - - if video_url: - if determine_ext(video_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls')) - else: - f = { - 'url': video_url, - } - - self._search_dimensions_in_video_url(f, video_url) - - formats.append(f) - - vmap_url = config.get('vmapUrl') or config.get('vmap_url') - if vmap_url: - formats.extend( - self._extract_formats_from_vmap_url(vmap_url, video_id)) - - media_info = None - - for entity in config.get('status', {}).get('entities', []): - if 'mediaInfo' in entity: - media_info = entity['mediaInfo'] - - if media_info: - formats.extend(self._parse_media_info(media_info, video_id)) - duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) - - username = config.get('user', {}).get('screen_name') - if username: - formats.extend(self._extract_mobile_formats(username, video_id)) - - if formats: - title = self._search_regex(r'([^<]+)', webpage, 'title') - thumbnail = config.get('posterImageUrl') or config.get('image_src') - duration = float_or_none(config.get('duration'), scale=1000) or duration - break - - if not formats: - headers = { - 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', - 'Referer': url, - } - ct0 = self._get_cookies(url).get('ct0') - if ct0: - headers['csrf_token'] = ct0.value - guest_token = self._download_json( - '%s/guest/activate.json' % self._API_BASE, video_id, - 'Downloading guest token', data=b'', - headers=headers)['guest_token'] - headers['x-guest-token'] = guest_token - self._set_cookie('api.twitter.com', 'gt', guest_token) - config = self._download_json( - '%s/videos/tweet/config/%s.json' % (self._API_BASE, video_id), - video_id, headers=headers) - track = config['track'] - vmap_url = track.get('vmapUrl') - if vmap_url: - formats = self._extract_formats_from_vmap_url(vmap_url, video_id) - else: - playback_url = track['playbackUrl'] - if determine_ext(playback_url) == 'm3u8': - formats = self._extract_m3u8_formats( - playback_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') - else: - formats = [{ - 'url': playback_url, - }] - title = 'Twitter web player' - thumbnail = config.get('posterImage') - duration = float_or_none(track.get('durationMs'), scale=1000) - - self._remove_duplicate_formats(formats) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } + status_id = self._match_id(url) + return self.url_result( + 'https://twitter.com/statuses/' + status_id, + TwitterIE.ie_key(), status_id) -class TwitterIE(InfoExtractor): +class TwitterIE(TwitterBaseIE): IE_NAME = 'twitter' - _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?:i/web|(?P[^/]+))/status/(?P\d+)' - _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' - _TEMPLATE_STATUSES_URL = 'https://twitter.com/statuses/%s' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P\d+)' _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', @@ -291,10 +195,13 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', + 'description': 'FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ', 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', 'duration': 12.922, + 'timestamp': 1442188653, + 'upload_date': '20150913', + 'age_limit': 18, }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', @@ -316,19 +223,23 @@ class TwitterIE(InfoExtractor): 'id': '665052190608723968', 'ext': 'mp4', 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.', - 'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."', + 'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ', 'uploader_id': 'starwars', 'uploader': 'Star Wars', + 'timestamp': 1447395772, + 'upload_date': '20151113', }, }, { 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', 'info_dict': { 'id': '705235433198714880', 'ext': 'mp4', - 'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.', - 'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight."', + 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.", + 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. https://t.co/OrxcJ28Bns", 'uploader_id': 'BTNBrentYarina', 'uploader': 'Brent Yarina', + 'timestamp': 1456976204, + 'upload_date': '20160303', }, 'params': { # The same video as https://twitter.com/i/videos/tweet/705235433198714880 @@ -340,12 +251,14 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel', - 'description': 'JG on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'title': 'Simon Vertugo - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'JG', - 'uploader_id': 'jaydingeer', + 'uploader': 'Simon Vertugo', + 'uploader_id': 'simonvertugo', 'duration': 30.0, + 'timestamp': 1455777459, + 'upload_date': '20160218', }, }, { 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', @@ -353,10 +266,9 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': 'MIOxnrUteUd', 'ext': 'mp4', - 'title': 'Vince Mancini - Vine of the day', - 'description': 'Vince Mancini on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', - 'uploader': 'Vince Mancini', - 'uploader_id': 'Filmdrunk', + 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン', + 'uploader': 'TAKUMA', + 'uploader_id': '1004126642786242560', 'timestamp': 1402826626, 'upload_date': '20140615', }, @@ -367,21 +279,22 @@ class TwitterIE(InfoExtractor): 'id': '719944021058060289', 'ext': 'mp4', 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.', - 'description': 'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"', - 'uploader_id': 'captainamerica', + 'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI', + 'uploader_id': 'CaptainAmerica', 'uploader': 'Captain America', 'duration': 3.17, + 'timestamp': 1460483005, + 'upload_date': '20160412', }, }, { 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', 'info_dict': { 'id': '1zqKVVlkqLaKB', 'ext': 'mp4', - 'title': 'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence', - 'description': 'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence https://t.co/EKrVgIXF3s"', + 'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence', 'upload_date': '20160923', - 'uploader_id': 'OPP_HSD', - 'uploader': 'Sgt Kerry Schmidt', + 'uploader_id': '1PmKqpJdOJQoY', + 'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police', 'timestamp': 1474613214, }, 'add_ie': ['Periscope'], @@ -392,10 +305,12 @@ class TwitterIE(InfoExtractor): 'id': '852138619213144067', 'ext': 'mp4', 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة', - 'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"', + 'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN', 'uploader': 'عالم الأخبار', 'uploader_id': 'news_al3alm', 'duration': 277.4, + 'timestamp': 1492000653, + 'upload_date': '20170412', }, }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', @@ -404,10 +319,12 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'Préfet de Guadeloupe on Twitter: "[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo"', + 'description': '[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo', 'uploader': 'Préfet de Guadeloupe', 'uploader_id': 'Prefet971', 'duration': 47.48, + 'timestamp': 1505803395, + 'upload_date': '20170919', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -420,10 +337,12 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 're:.*?Shep is on a roll today.*?', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'md5:63b036c228772523ae1924d5f8e5ed6b', + 'description': 'md5:37b9f2ff31720cef23b2bd42ee8a0f09', 'uploader': 'Lis Power', 'uploader_id': 'LisPower1', 'duration': 111.278, + 'timestamp': 1527623489, + 'upload_date': '20180529', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -435,88 +354,163 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'md5:66d493500c013e3e2d434195746a7f78', + 'description': 'md5:6dfd341a3310fb97d80d2bf7145df976', 'uploader': 'Twitter', 'uploader_id': 'Twitter', 'duration': 61.567, + 'timestamp': 1548184644, + 'upload_date': '20190122', }, + }, { + # not available in Periscope + 'url': 'https://twitter.com/ViviEducation/status/1136534865145286656', + 'info_dict': { + 'id': '1vOGwqejwoWxB', + 'ext': 'mp4', + 'title': 'Vivi - Vivi founder @lior_rauchy announcing our new student feedback tool live at @EduTECH_AU #EduTECH2019', + 'uploader': 'Vivi', + 'uploader_id': '1eVjYOLGkGrQL', + }, + 'add_ie': ['TwitterBroadcast'], + }, { + # Twitch Clip Embed + 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - twid = mobj.group('id') - - webpage, urlh = self._download_webpage_handle( - self._TEMPLATE_STATUSES_URL % twid, twid) - - if 'twitter.com/account/suspended' in urlh.geturl(): - raise ExtractorError('Account suspended by Twitter.', expected=True) - - user_id = None - - redirect_mobj = re.match(self._VALID_URL, urlh.geturl()) - if redirect_mobj: - user_id = redirect_mobj.group('user_id') - - if not user_id: - user_id = mobj.group('user_id') - - username = remove_end(self._og_search_title(webpage), ' on Twitter') - - title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”') + twid = self._match_id(url) + status = self._call_api( + 'statuses/show/%s.json' % twid, twid, { + 'cards_platform': 'Web-12', + 'include_cards': 1, + 'include_reply_count': 1, + 'include_user_entities': 0, + 'tweet_mode': 'extended', + }) + title = description = status['full_text'].replace('\n', ' ') # strip 'https -_t.co_BJYgOjSeGA' junk from filenames title = re.sub(r'\s+(https?://[^ ]+)', '', title) + user = status.get('user') or {} + uploader = user.get('name') + if uploader: + title = '%s - %s' % (uploader, title) + uploader_id = user.get('screen_name') + + tags = [] + for hashtag in (try_get(status, lambda x: x['entities']['hashtags'], list) or []): + hashtag_text = hashtag.get('text') + if not hashtag_text: + continue + tags.append(hashtag_text) info = { - 'uploader_id': user_id, - 'uploader': username, - 'webpage_url': url, - 'description': '%s on Twitter: "%s"' % (username, description), - 'title': username + ' - ' + title, + 'id': twid, + 'title': title, + 'description': description, + 'uploader': uploader, + 'timestamp': unified_timestamp(status.get('created_at')), + 'uploader_id': uploader_id, + 'uploader_url': 'https://twitter.com/' + uploader_id if uploader_id else None, + 'like_count': int_or_none(status.get('favorite_count')), + 'repost_count': int_or_none(status.get('retweet_count')), + 'comment_count': int_or_none(status.get('reply_count')), + 'age_limit': 18 if status.get('possibly_sensitive') else 0, + 'tags': tags, } - mobj = re.search(r'''(?x) - ]+class="animated-gif"(?P[^>]+)>\s* - ]+video-src="(?P[^"]+)" - ''', webpage) + media = try_get(status, lambda x: x['extended_entities']['media'][0]) + if media and media.get('type') != 'photo': + video_info = media.get('video_info') or {} + + formats = [] + for variant in video_info.get('variants', []): + formats.extend(self._extract_variant_formats(variant, twid)) + self._sort_formats(formats) + + thumbnails = [] + media_url = media.get('media_url_https') or media.get('media_url') + if media_url: + def add_thumbnail(name, size): + thumbnails.append({ + 'id': name, + 'url': update_url_query(media_url, {'name': name}), + 'width': int_or_none(size.get('w') or size.get('width')), + 'height': int_or_none(size.get('h') or size.get('height')), + }) + for name, size in media.get('sizes', {}).items(): + add_thumbnail(name, size) + add_thumbnail('orig', media.get('original_info') or {}) - if mobj: - more_info = mobj.group('more_info') - height = int_or_none(self._search_regex( - r'data-height="(\d+)"', more_info, 'height', fatal=False)) - width = int_or_none(self._search_regex( - r'data-width="(\d+)"', more_info, 'width', fatal=False)) - thumbnail = self._search_regex( - r'poster="([^"]+)"', more_info, 'poster', fatal=False) info.update({ - 'id': twid, - 'url': mobj.group('url'), - 'height': height, - 'width': width, - 'thumbnail': thumbnail, + 'formats': formats, + 'thumbnails': thumbnails, + 'duration': float_or_none(video_info.get('duration_millis'), 1000), }) - return info - - twitter_card_url = None - if 'class="PlayableMedia' in webpage: - twitter_card_url = '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid) else: - twitter_card_iframe_url = self._search_regex( - r'data-full-card-iframe-url=([\'"])(?P(?:(?!\1).)+)\1', - webpage, 'Twitter card iframe URL', default=None, group='url') - if twitter_card_iframe_url: - twitter_card_url = compat_urlparse.urljoin(url, twitter_card_iframe_url) + card = status.get('card') + if card: + binding_values = card['binding_values'] - if twitter_card_url: - info.update({ - '_type': 'url_transparent', - 'ie_key': 'TwitterCard', - 'url': twitter_card_url, - }) - return info + def get_binding_value(k): + o = binding_values.get(k) or {} + return try_get(o, lambda x: x[x['type'].lower() + '_value']) - raise ExtractorError('There\'s no video in this tweet.') + card_name = card['name'].split(':')[-1] + if card_name == 'amplify': + formats = self._extract_formats_from_vmap_url( + get_binding_value('amplify_url_vmap'), + get_binding_value('amplify_content_id') or twid) + self._sort_formats(formats) + + thumbnails = [] + for suffix in ('_small', '', '_large', '_x_large', '_original'): + image = get_binding_value('player_image' + suffix) or {} + image_url = image.get('url') + if not image_url or '/player-placeholder' in image_url: + continue + thumbnails.append({ + 'id': suffix[1:] if suffix else 'medium', + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) + + info.update({ + 'formats': formats, + 'thumbnails': thumbnails, + 'duration': int_or_none(get_binding_value( + 'content_duration_seconds')), + }) + elif card_name == 'player': + info.update({ + '_type': 'url', + 'url': get_binding_value('player_url'), + }) + elif card_name == 'periscope_broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + }) + elif card_name == 'broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + }) + else: + raise ExtractorError('Unsupported Twitter Card.') + else: + expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) + if not expanded_url: + raise ExtractorError("There's no video in this tweet.") + info.update({ + '_type': 'url', + 'url': expanded_url, + }) + return info class TwitterAmplifyIE(TwitterBaseIE): @@ -573,3 +567,27 @@ class TwitterAmplifyIE(TwitterBaseIE): 'formats': formats, 'thumbnails': thumbnails, } + + +class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): + IE_NAME = 'twitter:broadcast' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P[0-9a-zA-Z]{13})' + + def _real_extract(self, url): + broadcast_id = self._match_id(url) + broadcast = self._call_api( + 'broadcasts/show.json', broadcast_id, + {'ids': broadcast_id})['broadcasts'][broadcast_id] + info = self._parse_broadcast_data(broadcast, broadcast_id) + media_key = broadcast['media_key'] + source = self._call_api( + 'live_video_stream/status/' + media_key, media_key)['source'] + m3u8_url = source.get('noRedirectPlaybackUrl') or source['location'] + if '/live_video_stream/geoblocked/' in m3u8_url: + self.raise_geo_restricted() + m3u8_id = compat_parse_qs(compat_urllib_parse_urlparse( + m3u8_url).query).get('type', [None])[0] + state, width, height = self._extract_common_format_info(broadcast) + info['formats'] = self._extract_pscp_m3u8_formats( + m3u8_url, broadcast_id, m3u8_id, state, width, height) + return info From ce112a8c19ebcc9d401ff26a5cdcf58ba565901c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 11:01:07 +0100 Subject: [PATCH 03/14] [twitch] fix video comments URL(#18593)(closes #15828) --- youtube_dl/extractor/twitch.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index a5681409c..8c0d70010 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -344,9 +344,8 @@ class TwitchVodIE(TwitchItemBaseIE): info['subtitles'] = { 'rechat': [{ 'url': update_url_query( - 'https://rechat.twitch.tv/rechat-messages', { - 'video_id': 'v%s' % item_id, - 'start': info['timestamp'], + 'https://api.twitch.tv/v5/videos/%s/comments' % item_id, { + 'client_id': self._CLIENT_ID, }), 'ext': 'json', }], From f81dd65ba2c1e7be549e5c8cfe6cbf0f0829edfe Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 13:11:59 +0100 Subject: [PATCH 04/14] [extractor/common] clean jwplayer description HTML tags --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4a683f6d6..4c2f9303e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2689,7 +2689,7 @@ class InfoExtractor(object): entry = { 'id': this_video_id, 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), - 'description': video_data.get('description'), + 'description': clean_html(video_data.get('description')), 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))), 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), From 8fbf5d2f87fbfe0441bc20cf69d506109b2810bc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 13:14:23 +0100 Subject: [PATCH 05/14] [seeker] remove Revision3 extractors and fix extraction --- youtube_dl/extractor/extractors.py | 4 - youtube_dl/extractor/revision3.py | 170 ----------------------------- youtube_dl/extractor/seeker.py | 45 ++++---- 3 files changed, 23 insertions(+), 196 deletions(-) delete mode 100644 youtube_dl/extractor/revision3.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 598006061..8df9d95b1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -932,10 +932,6 @@ from .rentv import ( from .restudy import RestudyIE from .reuters import ReutersIE from .reverbnation import ReverbNationIE -from .revision3 import ( - Revision3EmbedIE, - Revision3IE, -) from .rice import RICEIE from .rmcdecouverte import RMCDecouverteIE from .ro220 import Ro220IE diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py deleted file mode 100644 index 833d8a2f0..000000000 --- a/youtube_dl/extractor/revision3.py +++ /dev/null @@ -1,170 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_iso8601, - unescapeHTML, - qualities, -) - - -class Revision3EmbedIE(InfoExtractor): - IE_NAME = 'revision3:embed' - _VALID_URL = r'(?:revision3:(?:(?P[^:]+):)?|https?://(?:(?:(?:www|embed)\.)?(?:revision3|animalist)|(?:(?:api|embed)\.)?seekernetwork)\.com/player/embed\?videoId=)(?P\d+)' - _TEST = { - 'url': 'http://api.seekernetwork.com/player/embed?videoId=67558', - 'md5': '83bcd157cab89ad7318dd7b8c9cf1306', - 'info_dict': { - 'id': '67558', - 'ext': 'mp4', - 'title': 'The Pros & Cons Of Zoos', - 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', - 'uploader_id': 'dnews', - 'uploader': 'DNews', - } - } - _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('playlist_id') - playlist_type = mobj.group('playlist_type') or 'video_id' - video_data = self._download_json( - 'http://revision3.com/api/getPlaylist.json', playlist_id, query={ - 'api_key': self._API_KEY, - 'codecs': 'h264,vp8,theora', - playlist_type: playlist_id, - })['items'][0] - - formats = [] - for vcodec, media in video_data['media'].items(): - for quality_id, quality in media.items(): - if quality_id == 'hls': - formats.extend(self._extract_m3u8_formats( - quality['url'], playlist_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': quality['url'], - 'format_id': '%s-%s' % (vcodec, quality_id), - 'tbr': int_or_none(quality.get('bitrate')), - 'vcodec': vcodec, - }) - self._sort_formats(formats) - - return { - 'id': playlist_id, - 'title': unescapeHTML(video_data['title']), - 'description': unescapeHTML(video_data.get('summary')), - 'uploader': video_data.get('show', {}).get('name'), - 'uploader_id': video_data.get('show', {}).get('slug'), - 'duration': int_or_none(video_data.get('duration')), - 'formats': formats, - } - - -class Revision3IE(InfoExtractor): - IE_NAME = 'revision' - _VALID_URL = r'https?://(?:www\.)?(?P(?:revision3|animalist)\.com)/(?P[^/]+(?:/[^/?#]+)?)' - _TESTS = [{ - 'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016', - 'md5': 'd94a72d85d0a829766de4deb8daaf7df', - 'info_dict': { - 'id': '71089', - 'display_id': 'technobuffalo/5-google-predictions-for-2016', - 'ext': 'webm', - 'title': '5 Google Predictions for 2016', - 'description': 'Google had a great 2015, but it\'s already time to look ahead. Here are our five predictions for 2016.', - 'upload_date': '20151228', - 'timestamp': 1451325600, - 'duration': 187, - 'uploader': 'TechnoBuffalo', - 'uploader_id': 'technobuffalo', - } - }, { - # Show - 'url': 'http://revision3.com/variant', - 'only_matching': True, - }, { - # Tag - 'url': 'http://revision3.com/vr', - 'only_matching': True, - }] - _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s' - - def _real_extract(self, url): - domain, display_id = re.match(self._VALID_URL, url).groups() - site = domain.split('.')[0] - page_info = self._download_json( - self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id) - - page_data = page_info['data'] - page_type = page_data['type'] - if page_type in ('episode', 'embed'): - show_data = page_data['show']['data'] - page_id = compat_str(page_data['id']) - video_id = compat_str(page_data['video']['data']['id']) - - preference = qualities(['mini', 'small', 'medium', 'large']) - thumbnails = [{ - 'url': image_url, - 'id': image_id, - 'preference': preference(image_id) - } for image_id, image_url in page_data.get('images', {}).items()] - - info = { - 'id': page_id, - 'display_id': display_id, - 'title': unescapeHTML(page_data['name']), - 'description': unescapeHTML(page_data.get('summary')), - 'timestamp': parse_iso8601(page_data.get('publishTime'), ' '), - 'author': page_data.get('author'), - 'uploader': show_data.get('name'), - 'uploader_id': show_data.get('slug'), - 'thumbnails': thumbnails, - 'extractor_key': site, - } - - if page_type == 'embed': - info.update({ - '_type': 'url_transparent', - 'url': page_data['video']['data']['embed'], - }) - return info - - info.update({ - '_type': 'url_transparent', - 'url': 'revision3:%s' % video_id, - }) - return info - else: - list_data = page_info[page_type]['data'] - episodes_data = page_info['episodes']['data'] - num_episodes = page_info['meta']['totalEpisodes'] - processed_episodes = 0 - entries = [] - page_num = 1 - while True: - entries.extend([{ - '_type': 'url', - 'url': 'http://%s%s' % (domain, episode['path']), - 'id': compat_str(episode['id']), - 'ie_key': 'Revision3', - 'extractor_key': site, - } for episode in episodes_data]) - processed_episodes += len(episodes_data) - if processed_episodes == num_episodes: - break - page_num += 1 - episodes_data = self._download_json(self._PAGE_DATA_TEMPLATE % ( - domain, display_id + '/' + compat_str(page_num), domain), - display_id)['episodes']['data'] - - return self.playlist_result( - entries, compat_str(list_data['id']), - list_data.get('name'), list_data.get('summary')) diff --git a/youtube_dl/extractor/seeker.py b/youtube_dl/extractor/seeker.py index 3b9c65e7e..7872dc80d 100644 --- a/youtube_dl/extractor/seeker.py +++ b/youtube_dl/extractor/seeker.py @@ -4,34 +4,37 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + get_element_by_class, + strip_or_none, +) class SeekerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P.*)-(?P\d+)\.html' _TESTS = [{ - # player.loadRevision3Item 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html', - 'md5': '30c1dc4030cc715cf05b423d0947ac18', + 'md5': '897d44bbe0d8986a2ead96de565a92db', 'info_dict': { - 'id': '76243', - 'ext': 'webm', + 'id': 'Elrn3gnY', + 'ext': 'mp4', 'title': 'Should Trump Be Required To Release His Tax Returns?', - 'description': 'Donald Trump has been secretive about his "big," "beautiful" tax returns. So what can we learn if he decides to release them?', - 'uploader': 'Seeker Daily', - 'uploader_id': 'seekerdaily', + 'description': 'md5:41efa8cfa8d627841045eec7b018eb45', + 'timestamp': 1490090165, + 'upload_date': '20170321', } }, { 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html', 'playlist': [ { - 'md5': '83bcd157cab89ad7318dd7b8c9cf1306', + 'md5': '0497b9f20495174be73ae136949707d2', 'info_dict': { - 'id': '67558', + 'id': 'FihYQ8AE', 'ext': 'mp4', 'title': 'The Pros & Cons Of Zoos', - 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', - 'uploader': 'DNews', - 'uploader_id': 'dnews', + 'description': 'md5:d88f99a8ea8e7d25e6ff77f271b1271c', + 'timestamp': 1490039133, + 'upload_date': '20170320', }, } ], @@ -45,13 +48,11 @@ class SeekerIE(InfoExtractor): def _real_extract(self, url): display_id, article_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - mobj = re.search(r"player\.loadRevision3Item\('([^']+)'\s*,\s*(\d+)\);", webpage) - if mobj: - playlist_type, playlist_id = mobj.groups() - return self.url_result( - 'revision3:%s:%s' % (playlist_type, playlist_id), 'Revision3Embed', playlist_id) - else: - entries = [self.url_result('revision3:video_id:%s' % video_id, 'Revision3Embed', video_id) for video_id in re.findall( - r']+src=[\'"](?:https?:)?//api\.seekernetwork\.com/player/embed\?videoId=(\d+)', webpage)] - return self.playlist_result( - entries, article_id, self._og_search_title(webpage), self._og_search_description(webpage)) + entries = [] + for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage): + entries.append(self.url_result( + 'jwplatform:' + jwp_id, 'JWPlatform', jwp_id)) + return self.playlist_result( + entries, article_id, + self._og_search_title(webpage), + strip_or_none(get_element_by_class('subtitle__text', webpage)) or self._og_search_description(webpage)) From 20baa17c0180c7254644abea968792abcf0743cb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 16:00:12 +0100 Subject: [PATCH 06/14] [daisuki] remove extractor --- youtube_dl/extractor/daisuki.py | 154 ----------------------------- youtube_dl/extractor/extractors.py | 4 - 2 files changed, 158 deletions(-) delete mode 100644 youtube_dl/extractor/daisuki.py diff --git a/youtube_dl/extractor/daisuki.py b/youtube_dl/extractor/daisuki.py deleted file mode 100644 index dbc1aa5d4..000000000 --- a/youtube_dl/extractor/daisuki.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import unicode_literals - -import base64 -import json -import random -import re - -from .common import InfoExtractor -from ..aes import ( - aes_cbc_decrypt, - aes_cbc_encrypt, -) -from ..compat import compat_b64decode -from ..utils import ( - bytes_to_intlist, - bytes_to_long, - extract_attributes, - ExtractorError, - intlist_to_bytes, - js_to_json, - int_or_none, - long_to_bytes, - pkcs1pad, -) - - -class DaisukiMottoIE(InfoExtractor): - _VALID_URL = r'https?://motto\.daisuki\.net/framewatch/embed/[^/]+/(?P[0-9a-zA-Z]{3})' - - _TEST = { - 'url': 'http://motto.daisuki.net/framewatch/embed/embedDRAGONBALLSUPERUniverseSurvivalsaga/V2e/760/428', - 'info_dict': { - 'id': 'V2e', - 'ext': 'mp4', - 'title': '#117 SHOWDOWN OF LOVE! ANDROIDS VS UNIVERSE 2!!', - 'subtitles': { - 'mul': [{ - 'ext': 'ttml', - }], - }, - }, - 'params': { - 'skip_download': True, # AES-encrypted HLS stream - }, - } - - # The public key in PEM format can be found in clientlibs_anime_watch.min.js - _RSA_KEY = (0xc5524c25e8e14b366b3754940beeb6f96cb7e2feef0b932c7659a0c5c3bf173d602464c2df73d693b513ae06ff1be8f367529ab30bf969c5640522181f2a0c51ea546ae120d3d8d908595e4eff765b389cde080a1ef7f1bbfb07411cc568db73b7f521cedf270cbfbe0ddbc29b1ac9d0f2d8f4359098caffee6d07915020077d, 65537) - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - flashvars = self._parse_json(self._search_regex( - r'(?s)var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'), - video_id, transform_source=js_to_json) - - iv = [0] * 16 - - data = {} - for key in ('device_cd', 'mv_id', 'ss1_prm', 'ss2_prm', 'ss3_prm', 'ss_id'): - data[key] = flashvars.get(key, '') - - encrypted_rtn = None - - # Some AES keys are rejected. Try it with different AES keys - for idx in range(5): - aes_key = [random.randint(0, 254) for _ in range(32)] - padded_aeskey = intlist_to_bytes(pkcs1pad(aes_key, 128)) - - n, e = self._RSA_KEY - encrypted_aeskey = long_to_bytes(pow(bytes_to_long(padded_aeskey), e, n)) - init_data = self._download_json( - 'http://motto.daisuki.net/fastAPI/bgn/init/', - video_id, query={ - 's': flashvars.get('s', ''), - 'c': flashvars.get('ss3_prm', ''), - 'e': url, - 'd': base64.b64encode(intlist_to_bytes(aes_cbc_encrypt( - bytes_to_intlist(json.dumps(data)), - aes_key, iv))).decode('ascii'), - 'a': base64.b64encode(encrypted_aeskey).decode('ascii'), - }, note='Downloading JSON metadata' + (' (try #%d)' % (idx + 1) if idx > 0 else '')) - - if 'rtn' in init_data: - encrypted_rtn = init_data['rtn'] - break - - self._sleep(5, video_id) - - if encrypted_rtn is None: - raise ExtractorError('Failed to fetch init data') - - rtn = self._parse_json( - intlist_to_bytes(aes_cbc_decrypt(bytes_to_intlist( - compat_b64decode(encrypted_rtn)), - aes_key, iv)).decode('utf-8').rstrip('\0'), - video_id) - - title = rtn['title_str'] - - formats = self._extract_m3u8_formats( - rtn['play_url'], video_id, ext='mp4', entry_protocol='m3u8_native') - - subtitles = {} - caption_url = rtn.get('caption_url') - if caption_url: - # mul: multiple languages - subtitles['mul'] = [{ - 'url': caption_url, - 'ext': 'ttml', - }] - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - } - - -class DaisukiMottoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://motto\.daisuki\.net/(?Pinformation)/' - - _TEST = { - 'url': 'http://motto.daisuki.net/information/', - 'info_dict': { - 'title': 'DRAGON BALL SUPER', - }, - 'playlist_mincount': 117, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - entries = [] - for li in re.findall(r'(]+?data-product_id="[a-zA-Z0-9]{3}"[^>]+>)', webpage): - attr = extract_attributes(li) - ad_id = attr.get('data-ad_id') - product_id = attr.get('data-product_id') - if ad_id and product_id: - episode_id = attr.get('data-chapter') - entries.append({ - '_type': 'url_transparent', - 'url': 'http://motto.daisuki.net/framewatch/embed/%s/%s/760/428' % (ad_id, product_id), - 'episode_id': episode_id, - 'episode_number': int_or_none(episode_id), - 'ie_key': 'DaisukiMotto', - }) - - return self.playlist_result(entries, playlist_title='DRAGON BALL SUPER') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8df9d95b1..e2ebe8f95 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -254,10 +254,6 @@ from .dailymotion import ( DailymotionPlaylistIE, DailymotionUserIE, ) -from .daisuki import ( - DaisukiMottoIE, - DaisukiMottoPlaylistIE, -) from .daum import ( DaumIE, DaumClipIE, From 88b87b08b1ed06940053ee018547de051bf8d986 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 17:01:21 +0100 Subject: [PATCH 07/14] [minhateca] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/minhateca.py | 70 ------------------------------ 2 files changed, 71 deletions(-) delete mode 100644 youtube_dl/extractor/minhateca.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e2ebe8f95..dfd0ef198 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -625,7 +625,6 @@ from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, ) -from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE from .minoto import MinotoIE from .miomio import MioMioIE diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py deleted file mode 100644 index dccc54249..000000000 --- a/youtube_dl/extractor/minhateca.py +++ /dev/null @@ -1,70 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - parse_filesize, - sanitized_Request, - urlencode_postdata, -) - - -class MinhatecaIE(InfoExtractor): - _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P[0-9]+)\.' - _TEST = { - 'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)', - 'info_dict': { - 'id': '125848331', - 'ext': 'mp4', - 'title': 'youtube-dl test video', - 'thumbnail': r're:^https?://.*\.jpg$', - 'filesize_approx': 1530000, - 'duration': 9, - 'view_count': int, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - token = self._html_search_regex( - r'(.*?)', webpage, 'title') - title, _, ext = title_str.rpartition('.') - filesize_approx = parse_filesize(self._html_search_regex( - r'

(.*?)

', - webpage, 'file size approximation', fatal=False)) - duration = parse_duration(self._html_search_regex( - r'(?s)

.*?class="bold">(.*?)<', - webpage, 'duration', fatal=False)) - view_count = int_or_none(self._html_search_regex( - r'

([0-9]+)

', - webpage, 'view count', fatal=False)) - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'ext': ext, - 'filesize_approx': filesize_approx, - 'duration': duration, - 'view_count': view_count, - 'thumbnail': self._og_search_thumbnail(webpage), - } From 9e46d1f8aadd38f6de7c2b921b294e67ed2267eb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 17:15:15 +0100 Subject: [PATCH 08/14] [addanime] remove extractor --- youtube_dl/extractor/addanime.py | 95 ------------------------------ youtube_dl/extractor/extractors.py | 1 - 2 files changed, 96 deletions(-) delete mode 100644 youtube_dl/extractor/addanime.py diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py deleted file mode 100644 index 5e7c0724e..000000000 --- a/youtube_dl/extractor/addanime.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, -) -from ..utils import ( - ExtractorError, - qualities, -) - - -class AddAnimeIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P[\w_]+)' - _TESTS = [{ - 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', - 'md5': '72954ea10bc979ab5e2eb288b21425a0', - 'info_dict': { - 'id': '24MR3YO5SAS9', - 'ext': 'mp4', - 'description': 'One Piece 606', - 'title': 'One Piece 606', - }, - 'skip': 'Video is gone', - }, { - 'url': 'http://add-anime.net/video/MDUGWYKNGBD8/One-Piece-687', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - webpage = self._download_webpage(url, video_id) - except ExtractorError as ee: - if not isinstance(ee.cause, compat_HTTPError) or \ - ee.cause.code != 503: - raise - - redir_webpage = ee.cause.read().decode('utf-8') - action = self._search_regex( - r'
', - redir_webpage, 'redirect vc value') - av = re.search( - r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', - redir_webpage) - if av is None: - raise ExtractorError('Cannot find redirect math task') - av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3)) - - parsed_url = compat_urllib_parse_urlparse(url) - av_val = av_res + len(parsed_url.netloc) - confirm_url = ( - parsed_url.scheme + '://' + parsed_url.netloc - + action + '?' - + compat_urllib_parse_urlencode({ - 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) - self._download_webpage( - confirm_url, video_id, - note='Confirming after redirect') - webpage = self._download_webpage(url, video_id) - - FORMATS = ('normal', 'hq') - quality = qualities(FORMATS) - formats = [] - for format_id in FORMATS: - rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id) - video_url = self._search_regex(rex, webpage, 'video file URLx', - fatal=False) - if not video_url: - continue - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'quality': quality(format_id), - }) - self._sort_formats(formats) - video_title = self._og_search_title(webpage) - video_description = self._og_search_description(webpage) - - return { - '_type': 'video', - 'id': video_id, - 'formats': formats, - 'title': video_title, - 'description': video_description - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dfd0ef198..d96f0d284 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -18,7 +18,6 @@ from .acast import ( ACastIE, ACastChannelIE, ) -from .addanime import AddAnimeIE from .adn import ADNIE from .adobeconnect import AdobeConnectIE from .adobetv import ( From 433e0710585e2414697cff6d444204e1db950bd7 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 10 Nov 2019 17:02:47 +0100 Subject: [PATCH 09/14] [facebook] fix posts video data extraction(closes #22473) --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index c723726b7..ce64e2683 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -334,7 +334,7 @@ class FacebookIE(InfoExtractor): if not video_data: server_js_data = self._parse_json( self._search_regex( - r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)', + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)', webpage, 'js data', default='{}'), video_id, transform_source=js_to_json, fatal=False) video_data = extract_from_jsmods_instances(server_js_data) From 2e9ad59a4d6dfd82b34a965cfc5b8c5a647d1598 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 11 Nov 2019 09:53:04 +0100 Subject: [PATCH 10/14] [soundcloud] check if the soundtrack has downloads left(closes #23045) --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 875b9d887..e8ffb2cbe 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -276,7 +276,7 @@ class SoundcloudIE(InfoExtractor): if secret_token: query['secret_token'] = secret_token - if info.get('downloadable'): + if info.get('downloadable') and info.get('has_downloads_left'): format_url = update_url_query( info.get('download_url') or track_base_url + '/download', query) format_urls.add(format_url) From 48970d5cc8838ac404a64462d175b248401e2bd2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 12 Nov 2019 10:51:54 +0100 Subject: [PATCH 11/14] [teamcoco] add support for new videos(closes #23054) --- youtube_dl/extractor/teamcoco.py | 68 +++++++++++++++++--------------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 7640cf00a..5793b711f 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -84,6 +84,19 @@ class TeamcocoIE(TurnerBaseIE): 'only_matching': True, } ] + _RECORD_TEMPL = '''id + title + teaser + publishOn + thumb { + preview + } + tags { + name + } + duration + turnerMediaId + turnerMediaAuthToken''' def _graphql_call(self, query_template, object_type, object_id): find_object = 'find' + object_type @@ -98,36 +111,36 @@ class TeamcocoIE(TurnerBaseIE): display_id = self._match_id(url) response = self._graphql_call('''{ - %s(slug: "%s") { + %%s(slug: "%%s") { ... on RecordSlug { record { + %s + } + } + ... on PageSlug { + child { id - title - teaser - publishOn - thumb { - preview - } - file { - url - } - tags { - name - } - duration - turnerMediaId - turnerMediaAuthToken } } ... on NotFoundSlug { status } } -}''', 'Slug', display_id) +}''' % self._RECORD_TEMPL, 'Slug', display_id) if response.get('status'): raise ExtractorError('This video is no longer available.', expected=True) - record = response['record'] + child = response.get('child') + if child: + record = self._graphql_call('''{ + %%s(id: "%%s") { + ... on Video { + %s + } + } +}''' % self._RECORD_TEMPL, 'Record', child['id']) + else: + record = response['record'] video_id = record['id'] info = { @@ -150,25 +163,21 @@ class TeamcocoIE(TurnerBaseIE): 'accessTokenType': 'jws', })) else: - d = self._download_json( + video_sources = self._download_json( 'https://teamcoco.com/_truman/d/' + video_id, - video_id, fatal=False) or {} - video_sources = d.get('meta') or {} - if not video_sources: - video_sources = self._graphql_call('''{ - %s(id: "%s") { - src - } -}''', 'RecordVideoSource', video_id) or {} + video_id)['meta']['src'] + if isinstance(video_sources, dict): + video_sources = video_sources.values() formats = [] get_quality = qualities(['low', 'sd', 'hd', 'uhd']) - for format_id, src in video_sources.get('src', {}).items(): + for src in video_sources: if not isinstance(src, dict): continue src_url = src.get('src') if not src_url: continue + format_id = src.get('label') ext = determine_ext(src_url, mimetype2ext(src.get('type'))) if format_id == 'hls' or ext == 'm3u8': # compat_urllib_parse.urljoin does not work here @@ -190,9 +199,6 @@ class TeamcocoIE(TurnerBaseIE): 'format_id': format_id, 'quality': get_quality(format_id), }) - if not formats: - formats = self._extract_m3u8_formats( - record['file']['url'], video_id, 'mp4', fatal=False) self._sort_formats(formats) info['formats'] = formats From eb22d1b55744b69d5ec3556529868acfba6c217f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 13 Nov 2019 19:09:32 +0100 Subject: [PATCH 12/14] [nexx] Add support for Multi Player JS Setup(closes #23052) --- youtube_dl/extractor/nexx.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index f9aad83c4..586c1b7eb 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -108,7 +108,7 @@ class NexxIE(InfoExtractor): @staticmethod def _extract_domain_id(webpage): mobj = re.search( - r']+\bsrc=["\'](?:https?:)?//require\.nexx(?:\.cloud|cdn\.com)/(?P\d+)', + r']+\bsrc=["\'](?:https?:)?//(?:require|arc)\.nexx(?:\.cloud|cdn\.com)/(?:sdk/)?(?P\d+)', webpage) return mobj.group('id') if mobj else None @@ -123,7 +123,7 @@ class NexxIE(InfoExtractor): domain_id = NexxIE._extract_domain_id(webpage) if domain_id: for video_id in re.findall( - r'(?is)onPLAYReady.+?_play\.init\s*\(.+?\s*,\s*["\']?(\d+)', + r'(?is)onPLAYReady.+?_play\.(?:init|(?:control\.)?addPlayer)\s*\(.+?\s*,\s*["\']?(\d+)', webpage): entries.append( 'https://api.nexx.cloud/v3/%s/videos/byid/%s' @@ -410,8 +410,8 @@ class NexxIE(InfoExtractor): class NexxEmbedIE(InfoExtractor): - _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?P[^/?#&]+)' - _TEST = { + _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P[^/?#&]+)' + _TESTS = [{ 'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1', 'md5': '16746bfc28c42049492385c989b26c4a', 'info_dict': { @@ -420,7 +420,6 @@ class NexxEmbedIE(InfoExtractor): 'title': 'Nervenkitzel Achterbahn', 'alt_title': 'Karussellbauer in Deutschland', 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', - 'release_year': 2005, 'creator': 'SPIEGEL TV', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2761, @@ -431,7 +430,10 @@ class NexxEmbedIE(InfoExtractor): 'format': 'bestvideo', 'skip_download': True, }, - } + }, { + 'url': 'https://embed.nexx.cloud/11888/video/DSRTO7UVOX06S7', + 'only_matching': True, + }] @staticmethod def _extract_urls(webpage): From 5709d661a2509fab0c9f3412239ecbe7a621f45b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 Nov 2019 01:45:04 +0700 Subject: [PATCH 13/14] [drtv] Add support for new URL schema (closes #23059) --- youtube_dl/extractor/drtv.py | 57 ++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 218f10209..390e79f8c 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -17,6 +17,7 @@ from ..utils import ( float_or_none, mimetype2ext, str_or_none, + try_get, unified_timestamp, update_url_query, url_or_none, @@ -24,7 +25,14 @@ from ..utils import ( class DRTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*(?P[\da-z-]+)(?:[/#?]|$)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| + (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode)/ + ) + (?P[\da-z_-]+) + ''' _GEO_BYPASS = False _GEO_COUNTRIES = ['DK'] IE_NAME = 'drtv' @@ -83,6 +91,26 @@ class DRTVIE(InfoExtractor): }, { 'url': 'https://www.dr.dk/radio/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9', 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769', + 'info_dict': { + 'id': '00951930010', + 'ext': 'mp4', + 'title': 'Bonderøven (1:8)', + 'description': 'md5:3cf18fc0d3b205745d4505f896af8121', + 'timestamp': 1546542000, + 'upload_date': '20190103', + 'duration': 2576.6, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.dr.dk/drtv/episode/bonderoeven_71769', + 'only_matching': True, + }, { + 'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769', + 'only_matching': True, }] def _real_extract(self, url): @@ -100,13 +128,32 @@ class DRTVIE(InfoExtractor): webpage, 'video id', default=None) if not video_id: - video_id = compat_urllib_parse_unquote(self._search_regex( + video_id = self._search_regex( r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)', - webpage, 'urn')) + webpage, 'urn', default=None) + if video_id: + video_id = compat_urllib_parse_unquote(video_id) + + _PROGRAMCARD_BASE = 'https://www.dr.dk/mu-online/api/1.4/programcard' + query = {'expanded': 'true'} + + if video_id: + programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id) + else: + programcard_url = _PROGRAMCARD_BASE + page = self._parse_json( + self._search_regex( + r'data\s*=\s*({.+?})\s*(?:;| Date: Thu, 14 Nov 2019 06:38:55 +0100 Subject: [PATCH 14/14] [comcarcoff] remove extractor --- youtube_dl/extractor/comcarcoff.py | 74 ------------------------------ youtube_dl/extractor/extractors.py | 1 - 2 files changed, 75 deletions(-) delete mode 100644 youtube_dl/extractor/comcarcoff.py diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py deleted file mode 100644 index 588aad0d9..000000000 --- a/youtube_dl/extractor/comcarcoff.py +++ /dev/null @@ -1,74 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_duration, - parse_iso8601, -) - - -class ComCarCoffIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?comediansincarsgettingcoffee\.com/(?P[a-z0-9\-]*)' - _TESTS = [{ - 'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/', - 'info_dict': { - 'id': '2494164', - 'ext': 'mp4', - 'upload_date': '20141127', - 'timestamp': 1417107600, - 'duration': 1232, - 'title': 'Happy Thanksgiving Miranda', - 'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.', - }, - 'params': { - 'skip_download': 'requires ffmpeg', - } - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - if not display_id: - display_id = 'comediansincarsgettingcoffee.com' - webpage = self._download_webpage(url, display_id) - - full_data = self._parse_json( - self._search_regex( - r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'), - display_id)['videoData'] - - display_id = full_data['activeVideo']['video'] - video_data = full_data.get('videos', {}).get(display_id) or full_data['singleshots'][display_id] - - video_id = compat_str(video_data['mediaId']) - title = video_data['title'] - formats = self._extract_m3u8_formats( - video_data['mediaUrl'], video_id, 'mp4') - self._sort_formats(formats) - - thumbnails = [{ - 'url': video_data['images']['thumb'], - }, { - 'url': video_data['images']['poster'], - }] - - timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601( - video_data.get('pubDate')) - duration = int_or_none(video_data.get('durationSeconds')) or parse_duration( - video_data.get('duration')) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': video_data.get('description'), - 'timestamp': timestamp, - 'duration': duration, - 'thumbnails': thumbnails, - 'formats': formats, - 'season_number': int_or_none(video_data.get('season')), - 'episode_number': int_or_none(video_data.get('episode')), - 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d96f0d284..cf4bb8f20 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -222,7 +222,6 @@ from .comedycentral import ( ComedyCentralTVIE, ToshIE, ) -from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonprotocols import ( MmsIE,