From 2f7aa680b79b60d707d7b09818e3ec55748448b2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 6 Jan 2020 14:24:13 +0100 Subject: [PATCH 1/9] [discovery] fix anonymous token extraction(closes #23650) --- youtube_dl/extractor/discovery.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 6a2712cc5..e0139cc86 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -13,8 +13,8 @@ from ..compat import compat_HTTPError class DiscoveryIE(DiscoveryGoBaseIE): _VALID_URL = r'''(?x)https?:// (?P - (?:(?:www|go)\.)?discovery| - (?:www\.)? + go\.discovery| + www\. (?: investigationdiscovery| discoverylife| @@ -22,8 +22,7 @@ class DiscoveryIE(DiscoveryGoBaseIE): ahctv| destinationamerica| sciencechannel| - tlc| - velocity + tlc )| watch\. (?: @@ -83,7 +82,7 @@ class DiscoveryIE(DiscoveryGoBaseIE): 'authRel': 'authorization', 'client_id': '3020a40c2356a645b4b4', 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), - 'redirectUri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html?https://www.%s.com' % site, + 'redirectUri': 'https://www.discovery.com/', })['access_token'] headers = self.geo_verification_headers() From 0264903574f78ef5d950081a1afa542f6a063157 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 6 Jan 2020 14:25:23 +0100 Subject: [PATCH 2/9] [scrippsnetworks] add support for www.discovery.com videos --- youtube_dl/extractor/scrippsnetworks.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py index afab9591d..36e1b67a9 100644 --- a/youtube_dl/extractor/scrippsnetworks.py +++ b/youtube_dl/extractor/scrippsnetworks.py @@ -106,7 +106,7 @@ class ScrippsNetworksWatchIE(AWSIE): class ScrippsNetworksIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pcookingchanneltv|(?:diy|food)network|hgtv|travelchannel)\.com/videos/[0-9a-z-]+-(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?Pcookingchanneltv|discovery|(?:diy|food)network|hgtv|travelchannel)\.com/videos/[0-9a-z-]+-(?P\d+)' _TESTS = [{ 'url': 'https://www.cookingchanneltv.com/videos/the-best-of-the-best-0260338', 'info_dict': { @@ -131,9 +131,13 @@ class ScrippsNetworksIE(InfoExtractor): }, { 'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184', 'only_matching': True, + }, { + 'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184', + 'only_matching': True, }] _ACCOUNT_MAP = { 'cookingchanneltv': 2433005105, + 'discovery': 2706091867, 'diynetwork': 2433004575, 'foodnetwork': 2433005105, 'hgtv': 2433004575, From 7bac77413d2fbd7d9c79100ba85b59b08960e6f0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 6 Jan 2020 14:30:02 +0100 Subject: [PATCH 3/9] [scrippsnetworks] correct test case URL --- youtube_dl/extractor/scrippsnetworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py index 36e1b67a9..b40b4c4af 100644 --- a/youtube_dl/extractor/scrippsnetworks.py +++ b/youtube_dl/extractor/scrippsnetworks.py @@ -132,7 +132,7 @@ class ScrippsNetworksIE(InfoExtractor): 'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184', 'only_matching': True, }, { - 'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184', + 'url': 'https://www.discovery.com/videos/guardians-of-the-glades-cooking-with-tom-cobb-5578368', 'only_matching': True, }] _ACCOUNT_MAP = { From b2771a28530dab483848a7389616f1b52e96090c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 7 Jan 2020 13:03:32 +0100 Subject: [PATCH 4/9] [dctp] fix format extraction(closes #23656) --- youtube_dl/extractor/dctp.py | 50 +++++++++++++++--------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 04ff214f7..e700f8d86 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -16,10 +16,11 @@ class DctpTvIE(InfoExtractor): _TESTS = [{ # 4x3 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', + 'md5': '3ffbd1556c3fe210724d7088fad723e3', 'info_dict': { 'id': '95eaa4f33dad413aa17b4ee613cccc6c', 'display_id': 'videoinstallation-fuer-eine-kaufhausfassade', - 'ext': 'flv', + 'ext': 'm4v', 'title': 'Videoinstallation für eine Kaufhausfassade', 'description': 'Kurzfilm', 'thumbnail': r're:^https?://.*\.jpg$', @@ -27,10 +28,6 @@ class DctpTvIE(InfoExtractor): 'timestamp': 1302172322, 'upload_date': '20110407', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { # 16x9 'url': 'http://www.dctp.tv/filme/sind-youtuber-die-besseren-lehrer/', @@ -59,33 +56,26 @@ class DctpTvIE(InfoExtractor): uuid = media['uuid'] title = media['title'] - ratio = '16x9' if media.get('is_wide') else '4x3' - play_path = 'mp4:%s_dctp_0500_%s.m4v' % (uuid, ratio) + is_wide = media.get('is_wide') + formats = [] - servers = self._download_json( - 'http://www.dctp.tv/streaming_servers/', display_id, - note='Downloading server list JSON', fatal=False) + def add_formats(suffix): + templ = 'https://%%s/%s_dctp_%s.m4v' % (uuid, suffix) + formats.extend([{ + 'format_id': 'hls-' + suffix, + 'url': templ % 'cdn-segments.dctp.tv' + '/playlist.m3u8', + 'protocol': 'm3u8_native', + }, { + 'format_id': 's3-' + suffix, + 'url': templ % 'completed-media.s3.amazonaws.com', + }, { + 'format_id': 'http-' + suffix, + 'url': templ % 'cdn-media.dctp.tv', + }]) - if servers: - endpoint = next( - server['endpoint'] - for server in servers - if url_or_none(server.get('endpoint')) - and 'cloudfront' in server['endpoint']) - else: - endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/' - - app = self._search_regex( - r'^rtmpe?://[^/]+/(?P.*)$', endpoint, 'app') - - formats = [{ - 'url': endpoint, - 'app': app, - 'play_path': play_path, - 'page_url': url, - 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-110.swf', - 'ext': 'flv', - }] + add_formats('0500_' + ('16x9' if is_wide else '4x3')) + if is_wide: + add_formats('720p') thumbnails = [] images = media.get('images') From 3cb05b86de3887cfd2f5ebf41fedc09ff3ae6ff3 Mon Sep 17 00:00:00 2001 From: Singwai Chan Date: Tue, 7 Jan 2020 07:11:03 -0700 Subject: [PATCH 5/9] [pandatv] Remove extractor (#23630) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/pandatv.py | 99 ------------------------------ 2 files changed, 100 deletions(-) delete mode 100644 youtube_dl/extractor/pandatv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7b05f5410..1cab440f4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -808,7 +808,6 @@ from .packtpub import ( PacktPubIE, PacktPubCourseIE, ) -from .pandatv import PandaTVIE from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE diff --git a/youtube_dl/extractor/pandatv.py b/youtube_dl/extractor/pandatv.py deleted file mode 100644 index 4219802d5..000000000 --- a/youtube_dl/extractor/pandatv.py +++ /dev/null @@ -1,99 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - qualities, -) - - -class PandaTVIE(InfoExtractor): - IE_DESC = '熊猫TV' - _VALID_URL = r'https?://(?:www\.)?panda\.tv/(?P[0-9]+)' - _TESTS = [{ - 'url': 'http://www.panda.tv/66666', - 'info_dict': { - 'id': '66666', - 'title': 're:.+', - 'uploader': '刘杀鸡', - 'ext': 'flv', - 'is_live': True, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Live stream is offline', - }, { - 'url': 'https://www.panda.tv/66666', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - config = self._download_json( - 'https://www.panda.tv/api_room_v2?roomid=%s' % video_id, video_id) - - error_code = config.get('errno', 0) - if error_code != 0: - raise ExtractorError( - '%s returned error %s: %s' - % (self.IE_NAME, error_code, config['errmsg']), - expected=True) - - data = config['data'] - video_info = data['videoinfo'] - - # 2 = live, 3 = offline - if video_info.get('status') != '2': - raise ExtractorError( - 'Live stream is offline', expected=True) - - title = data['roominfo']['name'] - uploader = data.get('hostinfo', {}).get('name') - room_key = video_info['room_key'] - stream_addr = video_info.get( - 'stream_addr', {'OD': '1', 'HD': '1', 'SD': '1'}) - - # Reverse engineered from web player swf - # (http://s6.pdim.gs/static/07153e425f581151.swf at the moment of - # writing). - plflag0, plflag1 = video_info['plflag'].split('_') - plflag0 = int(plflag0) - 1 - if plflag1 == '21': - plflag0 = 10 - plflag1 = '4' - live_panda = 'live_panda' if plflag0 < 1 else '' - - plflag_auth = self._parse_json(video_info['plflag_list'], video_id) - sign = plflag_auth['auth']['sign'] - ts = plflag_auth['auth']['time'] - rid = plflag_auth['auth']['rid'] - - quality_key = qualities(['OD', 'HD', 'SD']) - suffix = ['_small', '_mid', ''] - formats = [] - for k, v in stream_addr.items(): - if v != '1': - continue - quality = quality_key(k) - if quality <= 0: - continue - for pref, (ext, pl) in enumerate((('m3u8', '-hls'), ('flv', ''))): - formats.append({ - 'url': 'https://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s?sign=%s&ts=%s&rid=%s' - % (pl, plflag1, room_key, live_panda, suffix[quality], ext, sign, ts, rid), - 'format_id': '%s-%s' % (k, ext), - 'quality': quality, - 'source_preference': pref, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': self._live_title(title), - 'uploader': uploader, - 'formats': formats, - 'is_live': True, - } From c88debff5d355cf345837fdd7f869db1ce8b9db3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 8 Jan 2020 10:54:05 +0100 Subject: [PATCH 6/9] [naver] improve extraction - improve geo-restriction handling - extract automatic captions - extract uploader metadata - extract VLive HLS formats --- youtube_dl/extractor/naver.py | 158 +++++++++++++++++++++------------- youtube_dl/extractor/vlive.py | 56 ++---------- 2 files changed, 107 insertions(+), 107 deletions(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index bb3d94413..f265fc929 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -1,68 +1,33 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( + clean_html, + dict_get, ExtractorError, + get_element_by_class, int_or_none, + try_get, update_url_query, ) -class NaverIE(InfoExtractor): - _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/v/(?P\d+)' +class NaverBaseIE(InfoExtractor): + _CAPTION_EXT_RE = r'\.(?:ttml|vtt)' - _TESTS = [{ - 'url': 'http://tv.naver.com/v/81652', - 'info_dict': { - 'id': '81652', - 'ext': 'mp4', - 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', - 'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', - 'upload_date': '20130903', - }, - }, { - 'url': 'http://tv.naver.com/v/395837', - 'md5': '638ed4c12012c458fefcddfd01f173cd', - 'info_dict': { - 'id': '395837', - 'ext': 'mp4', - 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', - 'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7', - 'upload_date': '20150519', - }, - 'skip': 'Georestricted', - }, { - 'url': 'http://tvcast.naver.com/v/81652', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - vid = self._search_regex( - r'videoId["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'video id', fatal=None, group='value') - in_key = self._search_regex( - r'inKey["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'key', default=None, group='value') - - if not vid or not in_key: - error = self._html_search_regex( - r'(?s)
\s*(?:)?\s*

(?P.+?)

\s*
', - webpage, 'error', default=None) - if error: - raise ExtractorError(error, expected=True) - raise ExtractorError('couldn\'t extract vid and key') + def _extract_video_info(self, video_id, vid, key): video_data = self._download_json( 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + vid, video_id, query={ - 'key': in_key, + 'key': key, }) meta = video_data['meta'] title = meta['subject'] formats = [] + get_list = lambda x: try_get(video_data, lambda y: y[x + 's']['list'], list) or [] def extract_formats(streams, stream_type, query={}): for stream in streams: @@ -73,7 +38,7 @@ class NaverIE(InfoExtractor): encoding_option = stream.get('encodingOption', {}) bitrate = stream.get('bitrate', {}) formats.append({ - 'format_id': '%s_%s' % (stream.get('type') or stream_type, encoding_option.get('id') or encoding_option.get('name')), + 'format_id': '%s_%s' % (stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))), 'url': stream_url, 'width': int_or_none(encoding_option.get('width')), 'height': int_or_none(encoding_option.get('height')), @@ -83,7 +48,7 @@ class NaverIE(InfoExtractor): 'protocol': 'm3u8_native' if stream_type == 'HLS' else None, }) - extract_formats(video_data.get('videos', {}).get('list', []), 'H264') + extract_formats(get_list('video'), 'H264') for stream_set in video_data.get('streams', []): query = {} for param in stream_set.get('keys', []): @@ -101,28 +66,101 @@ class NaverIE(InfoExtractor): 'mp4', 'm3u8_native', m3u8_id=stream_type, fatal=False)) self._sort_formats(formats) + replace_ext = lambda x, y: re.sub(self._CAPTION_EXT_RE, '.' + y, x) + + def get_subs(caption_url): + if re.search(self._CAPTION_EXT_RE, caption_url): + return [{ + 'url': replace_ext(caption_url, 'ttml'), + }, { + 'url': replace_ext(caption_url, 'vtt'), + }] + else: + return [{'url': caption_url}] + + automatic_captions = {} subtitles = {} - for caption in video_data.get('captions', {}).get('list', []): + for caption in get_list('caption'): caption_url = caption.get('source') if not caption_url: continue - subtitles.setdefault(caption.get('language') or caption.get('locale'), []).append({ - 'url': caption_url, - }) + sub_dict = automatic_captions if caption.get('type') == 'auto' else subtitles + sub_dict.setdefault(dict_get(caption, ('locale', 'language')), []).extend(get_subs(caption_url)) - upload_date = self._search_regex( - r']+class="date".*?(\d{4}\.\d{2}\.\d{2})', - webpage, 'upload date', fatal=False) - if upload_date: - upload_date = upload_date.replace('.', '') + user = meta.get('user', {}) return { 'id': video_id, 'title': title, 'formats': formats, 'subtitles': subtitles, - 'description': self._og_search_description(webpage), - 'thumbnail': meta.get('cover', {}).get('source') or self._og_search_thumbnail(webpage), + 'automatic_captions': automatic_captions, + 'thumbnail': try_get(meta, lambda x: x['cover']['source']), 'view_count': int_or_none(meta.get('count')), - 'upload_date': upload_date, + 'uploader_id': user.get('id'), + 'uploader': user.get('name'), + 'uploader_url': user.get('url'), } + + +class NaverIE(NaverBaseIE): + _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/(?:v|embed)/(?P\d+)' + _GEO_BYPASS = False + _TESTS = [{ + 'url': 'http://tv.naver.com/v/81652', + 'info_dict': { + 'id': '81652', + 'ext': 'mp4', + 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', + 'description': '메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', + 'upload_date': '20130903', + 'uploader': '메가스터디, 합격불변의 법칙', + 'uploader_id': 'megastudy', + }, + }, { + 'url': 'http://tv.naver.com/v/395837', + 'md5': '8a38e35354d26a17f73f4e90094febd3', + 'info_dict': { + 'id': '395837', + 'ext': 'mp4', + 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', + 'description': 'md5:eb6aca9d457b922e43860a2a2b1984d3', + 'upload_date': '20150519', + 'uploader': '4가지쇼 시즌2', + 'uploader_id': 'wrappinguser29', + }, + 'skip': 'Georestricted', + }, { + 'url': 'http://tvcast.naver.com/v/81652', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + content = self._download_json( + 'https://tv.naver.com/api/contents/json/v/' + video_id, + video_id, headers=self.geo_verification_headers()) + player_json = content.get('playerJson') or {} + + vid = player_json.get('videoId') + in_key = player_json.get('inKey') + + if not vid or not in_key: + player_auth = player_json.get('playerAuth') + if player_auth == 'notCountry': + self.raise_geo_restricted(countries=['KR']) + elif player_auth == 'notLogin': + self.raise_login_required() + raise ExtractorError('couldn\'t extract vid and key') + info = self._extract_video_info(video_id, vid, in_key) + + clip_info_html = content.get('clipInfoHtml') + if clip_info_html: + info['description'] = clean_html(get_element_by_class('desc', clip_info_html)) + upload_date = self._search_regex( + r']+class="date".*?(\d{4}\.\d{2}\.\d{2})', + clip_info_html, 'upload date', fatal=False) + if upload_date: + info['upload_date'] = upload_date.replace('.', '') + + return info diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index c3429f723..f79531e6f 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -6,22 +6,18 @@ import time import itertools from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_str, -) +from .naver import NaverBaseIE +from ..compat import compat_str from ..utils import ( - dict_get, ExtractorError, - float_or_none, - int_or_none, + merge_dicts, remove_start, try_get, urlencode_postdata, ) -class VLiveIE(InfoExtractor): +class VLiveIE(NaverBaseIE): IE_NAME = 'vlive' _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P[0-9]+)' _NETRC_MACHINE = 'vlive' @@ -34,6 +30,7 @@ class VLiveIE(InfoExtractor): 'title': "[V LIVE] Girl's Day's Broadcast", 'creator': "Girl's Day", 'view_count': int, + 'uploader_id': 'muploader_a', }, }, { 'url': 'http://www.vlive.tv/video/16937', @@ -44,6 +41,7 @@ class VLiveIE(InfoExtractor): 'creator': 'EXO', 'view_count': int, 'subtitles': 'mincount:12', + 'uploader_id': 'muploader_j', }, 'params': { 'skip_download': True, @@ -187,45 +185,9 @@ class VLiveIE(InfoExtractor): 'This video is only available for CH+ subscribers') long_video_id, key = video_info['vid'], video_info['inkey'] - playinfo = self._download_json( - 'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s' - % compat_urllib_parse_urlencode({ - 'videoId': long_video_id, - 'key': key, - 'ptc': 'http', - 'doct': 'json', # document type (xml or json) - 'cpt': 'vtt', # captions type (vtt or ttml) - }), video_id) - - formats = [{ - 'url': vid['source'], - 'format_id': vid.get('encodingOption', {}).get('name'), - 'abr': float_or_none(vid.get('bitrate', {}).get('audio')), - 'vbr': float_or_none(vid.get('bitrate', {}).get('video')), - 'width': int_or_none(vid.get('encodingOption', {}).get('width')), - 'height': int_or_none(vid.get('encodingOption', {}).get('height')), - 'filesize': int_or_none(vid.get('size')), - } for vid in playinfo.get('videos', {}).get('list', []) if vid.get('source')] - self._sort_formats(formats) - - view_count = int_or_none(playinfo.get('meta', {}).get('count')) - - subtitles = {} - for caption in playinfo.get('captions', {}).get('list', []): - lang = dict_get(caption, ('locale', 'language', 'country', 'label')) - if lang and caption.get('source'): - subtitles[lang] = [{ - 'ext': 'vtt', - 'url': caption['source']}] - - info = self._get_common_fields(webpage) - info.update({ - 'id': video_id, - 'formats': formats, - 'view_count': view_count, - 'subtitles': subtitles, - }) - return info + return merge_dicts( + self._get_common_fields(webpage), + self._extract_video_info(video_id, long_video_id, key)) def _download_init_page(self, video_id): return self._download_webpage( From 838171630da3691ad4df8a11eeab9b2632fb0bcf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 8 Jan 2020 12:55:33 +0100 Subject: [PATCH 7/9] [naver] improve metadata extraction --- youtube_dl/extractor/naver.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index f265fc929..61fc59126 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -8,8 +8,8 @@ from ..utils import ( clean_html, dict_get, ExtractorError, - get_element_by_class, int_or_none, + parse_duration, try_get, update_url_query, ) @@ -113,6 +113,7 @@ class NaverIE(NaverBaseIE): 'ext': 'mp4', 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', 'description': '메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', + 'timestamp': 1378200754, 'upload_date': '20130903', 'uploader': '메가스터디, 합격불변의 법칙', 'uploader_id': 'megastudy', @@ -125,6 +126,7 @@ class NaverIE(NaverBaseIE): 'ext': 'mp4', 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', 'description': 'md5:eb6aca9d457b922e43860a2a2b1984d3', + 'timestamp': 1432030253, 'upload_date': '20150519', 'uploader': '4가지쇼 시즌2', 'uploader_id': 'wrappinguser29', @@ -138,29 +140,27 @@ class NaverIE(NaverBaseIE): def _real_extract(self, url): video_id = self._match_id(url) content = self._download_json( - 'https://tv.naver.com/api/contents/json/v/' + video_id, + 'https://tv.naver.com/api/json/v/' + video_id, video_id, headers=self.geo_verification_headers()) - player_json = content.get('playerJson') or {} + player_info_json = content.get('playerInfoJson') or {} + current_clip = player_info_json.get('currentClip') or {} - vid = player_json.get('videoId') - in_key = player_json.get('inKey') + vid = current_clip.get('videoId') + in_key = current_clip.get('inKey') if not vid or not in_key: - player_auth = player_json.get('playerAuth') + player_auth = try_get(player_info_json, lambda x: x['playerOption']['auth']) if player_auth == 'notCountry': self.raise_geo_restricted(countries=['KR']) elif player_auth == 'notLogin': self.raise_login_required() raise ExtractorError('couldn\'t extract vid and key') info = self._extract_video_info(video_id, vid, in_key) - - clip_info_html = content.get('clipInfoHtml') - if clip_info_html: - info['description'] = clean_html(get_element_by_class('desc', clip_info_html)) - upload_date = self._search_regex( - r']+class="date".*?(\d{4}\.\d{2}\.\d{2})', - clip_info_html, 'upload date', fatal=False) - if upload_date: - info['upload_date'] = upload_date.replace('.', '') - + info.update({ + 'description': clean_html(current_clip.get('description')), + 'timestamp': int_or_none(current_clip.get('firstExposureTime'), 1000), + 'duration': parse_duration(current_clip.get('displayPlayTime')), + 'like_count': int_or_none(current_clip.get('recommendPoint')), + 'age_limit': 19 if current_clip.get('adult') else None, + }) return info From a71c1d1a5a54afc7f24acf3af7f1afd610c648f2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 8 Jan 2020 22:42:53 +0100 Subject: [PATCH 8/9] [cloudflarestream] improve extraction - add support for bytehighway.net domain - add support for signed URLs - extract thumbnail --- youtube_dl/extractor/cloudflarestream.py | 25 +++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/cloudflarestream.py b/youtube_dl/extractor/cloudflarestream.py index 8ff2c6531..9026c7c90 100644 --- a/youtube_dl/extractor/cloudflarestream.py +++ b/youtube_dl/extractor/cloudflarestream.py @@ -1,20 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import re from .common import InfoExtractor class CloudflareStreamIE(InfoExtractor): + _DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)' _VALID_URL = r'''(?x) https?:// (?: - (?:watch\.)?(?:cloudflarestream\.com|videodelivery\.net)/| - embed\.(?:cloudflarestream\.com|videodelivery\.net)/embed/[^/]+\.js\?.*?\bvideo= + (?:watch\.)?%s/| + embed\.%s/embed/[^/]+\.js\?.*?\bvideo= ) - (?P[\da-f]+) - ''' + (?P[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+) + ''' % (_DOMAIN_RE, _DOMAIN_RE) _TESTS = [{ 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', 'info_dict': { @@ -46,18 +48,23 @@ class CloudflareStreamIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net' + base_url = 'https://%s/%s/' % (domain, video_id) + if '.' in video_id: + video_id = self._parse_json(base64.urlsafe_b64decode( + video_id.split('.')[1]), video_id)['sub'] + manifest_base_url = base_url + 'manifest/video.' formats = self._extract_m3u8_formats( - 'https://cloudflarestream.com/%s/manifest/video.m3u8' % video_id, - video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False) + manifest_base_url + 'm3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) formats.extend(self._extract_mpd_formats( - 'https://cloudflarestream.com/%s/manifest/video.mpd' % video_id, - video_id, mpd_id='dash', fatal=False)) + manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) return { 'id': video_id, 'title': video_id, + 'thumbnail': base_url + 'thumbnails/thumbnail.jpg', 'formats': formats, } From 483b858d49eabaad2c521425eb892c1330d4f525 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 8 Jan 2020 23:07:41 +0100 Subject: [PATCH 9/9] [cloudflarestream] import embed URL extraction --- youtube_dl/extractor/cloudflarestream.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/cloudflarestream.py b/youtube_dl/extractor/cloudflarestream.py index 9026c7c90..2fdcfbb3a 100644 --- a/youtube_dl/extractor/cloudflarestream.py +++ b/youtube_dl/extractor/cloudflarestream.py @@ -9,14 +9,16 @@ from .common import InfoExtractor class CloudflareStreamIE(InfoExtractor): _DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)' + _EMBED_RE = r'embed\.%s/embed/[^/]+\.js\?.*?\bvideo=' % _DOMAIN_RE + _ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+' _VALID_URL = r'''(?x) https?:// (?: (?:watch\.)?%s/| - embed\.%s/embed/[^/]+\.js\?.*?\bvideo= + %s ) - (?P[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+) - ''' % (_DOMAIN_RE, _DOMAIN_RE) + (?P%s) + ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE) _TESTS = [{ 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', 'info_dict': { @@ -43,7 +45,7 @@ class CloudflareStreamIE(InfoExtractor): return [ mobj.group('url') for mobj in re.finditer( - r']+\bsrc=(["\'])(?P(?:https?:)?//embed\.(?:cloudflarestream\.com|videodelivery\.net)/embed/[^/]+\.js\?.*?\bvideo=[\da-f]+?.*?)\1', + r']+\bsrc=(["\'])(?P(?:https?:)?//%s(?:%s).*?)\1' % (CloudflareStreamIE._EMBED_RE, CloudflareStreamIE._ID_RE), webpage)] def _real_extract(self, url):