From 87f89dacddfa46399aea9252ca078f5f386dce38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Jun 2018 02:55:20 +0700 Subject: [PATCH 001/111] [pbs] Improve extraction (closes #16623, closes #16684) --- youtube_dl/extractor/pbs.py | 57 ++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8d6f2dd3d..52ab2f158 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, @@ -375,6 +376,35 @@ class PBSIE(InfoExtractor): }, 'expected_warnings': ['HTTP Error 403: Forbidden'], }, + { + 'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/', + 'info_dict': { + 'id': '3007193718', + 'ext': 'mp4', + 'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster", + 'description': 'md5:37efbac85e0c09b009586523ec143652', + 'duration': 6292, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, + { + 'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/', + 'info_dict': { + 'id': '3011407934', + 'ext': 'mp4', + 'title': 'Stories from the Stage - Road Trip', + 'duration': 1619, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -438,6 +468,7 @@ class PBSIE(InfoExtractor): r'', # jwplayer r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", r']+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ + r']+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/ ] media_id = self._search_regex( @@ -472,7 +503,8 @@ class PBSIE(InfoExtractor): if not url: url = self._og_search_url(webpage) - mobj = re.match(self._VALID_URL, url) + mobj = re.match( + self._VALID_URL, self._proto_relative_url(url.strip())) player_id = mobj.group('player_id') if not display_id: @@ -482,13 +514,27 @@ class PBSIE(InfoExtractor): url, display_id, note='Downloading player page', errnote='Could not download player page') video_id = self._search_regex( - r' Date: Sat, 16 Jun 2018 00:08:44 +0200 Subject: [PATCH 002/111] [vidzi] Fix extraction (closes #16678) --- youtube_dl/extractor/vidzi.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 9026e778c..d70283479 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -54,7 +54,8 @@ class VidziIE(InfoExtractor): self._search_regex( r'setup\(([^)]+)\)', code, 'jwplayer data', default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=js_to_json) + video_id, transform_source=lambda s: js_to_json( + re.sub(r'\s*\+\s*window\[.+?\]', '', s))) if jwplayer_data: break From 734d461ca04a9f271dd463aa75d44ac82377057e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Jun 2018 21:14:36 +0700 Subject: [PATCH 003/111] [expressen] Add extractor --- youtube_dl/extractor/expressen.py | 
77 ++++++++++++++++++++++++++++++
 youtube_dl/extractor/extractors.py |  1 +
 2 files changed, 78 insertions(+)
 create mode 100644 youtube_dl/extractor/expressen.py

diff --git a/youtube_dl/extractor/expressen.py b/youtube_dl/extractor/expressen.py
new file mode 100644
index 000000000..f61178012
--- /dev/null
+++ b/youtube_dl/extractor/expressen.py
@@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    unescapeHTML,
+    unified_timestamp,
+)
+
+
+class ExpressenIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?expressen\.se/tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/',
+        'md5': '2fbbe3ca14392a6b1b36941858d33a45',
+        'info_dict': {
+            'id': '8690962',
+            'ext': 'mp4',
+            'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden',
+            'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 788,
+            'timestamp': 1526639109,
+            'upload_date': '20180518',
+        },
+    }, {
+        'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        def extract_data(name):
+            return self._parse_json(
+                self._search_regex(
+                    r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
+                    webpage, 'info', group='value'),
+                display_id, transform_source=unescapeHTML)
+
+        info = extract_data('video-tracking-info')
+        video_id = info['videoId']
+
+        data = extract_data('article-data')
+        stream = data['stream']
+
+        if determine_ext(stream) == 'm3u8':
+            formats = self._extract_m3u8_formats(
+                stream, display_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls')
+        else:
+            formats = [{
+                'url': stream,
+            }]
+        self._sort_formats(formats)
+
+        title = info.get('titleRaw') or data['title']
+        description = info.get('descriptionRaw')
+        thumbnail = info.get('socialMediaImage') or data.get('image')
+        duration = int_or_none(info.get('videoTotalSecondsDuration') or
+                               data.get('totalSecondsDuration'))
+        timestamp = unified_timestamp(info.get('publishDate'))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index d4583b8e4..c3e6daa24 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -335,6 +335,7 @@ from .esri import EsriVideoIE
 from .europa import EuropaIE
 from .everyonesmixtape import EveryonesMixtapeIE
 from .expotv import ExpoTVIE
+from .expressen import ExpressenIE
 from .extremetube import ExtremeTubeIE
 from .eyedotv import EyedoTVIE
 from .facebook import (

From 764cd4e6f3450997eb0499b68b17b580a5e074f3 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Sun, 17 Jun 2018 02:43:24 +0100
Subject: [PATCH 004/111] [rtbf] improve extraction

- add support for audio and live streams(closes #11923)(closes #9638)
- extract HLS, DASH and all HTTP formats
- extract subtitles
- fixup specific http urls(fixes #16101)
---
 youtube_dl/extractor/rtbf.py | 127 ++++++++++++++++++++++++++---------
 1 file changed, 95 insertions(+), 32 deletions(-)

diff --git a/youtube_dl/extractor/rtbf.py 
b/youtube_dl/extractor/rtbf.py index 28cc5522d..acff9766a 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -1,10 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - int_or_none, ExtractorError, + float_or_none, + int_or_none, + strip_or_none, ) @@ -14,20 +18,19 @@ class RTBFIE(InfoExtractor): (?: video/[^?]+\?.*\bid=| ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| - auvio/[^/]+\?.*id= + auvio/[^/]+\?.*\b(?Pl)?id= )(?P\d+)''' _TESTS = [{ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', + 'md5': '8c876a1cceeb6cf31b476461ade72384', 'info_dict': { 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', - 'description': 'Football - Diables Rouges', - 'duration': 3099, + 'description': '(du 25/04/2014)', + 'duration': 3099.54, 'upload_date': '20140425', - 'timestamp': 1398456336, - 'uploader': 'rtbfsport', + 'timestamp': 1398456300, } }, { # geo restricted @@ -39,6 +42,18 @@ class RTBFIE(InfoExtractor): }, { 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, }] _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' _PROVIDERS = { @@ -53,46 +68,94 @@ class RTBFIE(InfoExtractor): ] def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_json( - 'http://www.rtbf.be/api/media/video?method=getVideoDetail&args[]=%s' % video_id, video_id) + live, media_id = re.match(self._VALID_URL, url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + data = self._parse_json(self._html_search_regex( + r'data-media="([^"]+)"', embed_page, 'media data'), media_id) error = data.get('error') if error: raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - data = data['data'] - provider = data.get('provider') if provider in self._PROVIDERS: return self.url_result(data['url'], self._PROVIDERS[provider]) + title = data['title'] + is_live = data.get('isLive') + if is_live: + title = self._live_title(title) + height_re = r'-(\d+)p\.' formats = [] - for key, format_id in self._QUALITIES: - format_url = data.get(key + 'Url') - if format_url: + + m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x + http_url = data.get('url') + if formats and http_url and re.search(height_re, http_url): + http_url = fix_url(http_url) + for m3u8_f in formats.copy(): + height = m3u8_f.get('height') + if not height: + continue + f = m3u8_f.copy() + del f['protocol'] + f.update({ + 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), + 'url': re.sub(height_re, '-%dp.' 
% height, http_url), + }) + formats.append(f) + else: + sources = data.get('sources') or {} + for key, format_id in self._QUALITIES: + format_url = sources.get(key) + if not format_url: + continue + height = int_or_none(self._search_regex( + height_re, format_url, 'height', default=None)) formats.append({ 'format_id': format_id, - 'url': format_url, + 'url': fix_url(format_url), + 'height': height, }) - thumbnails = [] - for thumbnail_id, thumbnail_url in data.get('thumbnail', {}).items(): - if thumbnail_id != 'default': - thumbnails.append({ - 'url': self._IMAGE_HOST + thumbnail_url, - 'id': thumbnail_id, - }) + mpd_url = data.get('urlDash') + if not data.get('drm') and mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, media_id, mpd_id='dash', fatal=False)) + + audio_url = data.get('urlAudio') + if audio_url: + formats.append({ + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + subtitles = {} + for track in (data.get('tracks') or {}).values(): + sub_url = track.get('url') + if not sub_url: + continue + subtitles.setdefault(track.get('lang') or 'fr', []).append({ + 'url': sub_url, + }) return { - 'id': video_id, + 'id': media_id, 'formats': formats, - 'title': data['title'], - 'description': data.get('description') or data.get('subtitle'), - 'thumbnails': thumbnails, - 'duration': data.get('duration') or data.get('realDuration'), - 'timestamp': int_or_none(data.get('created')), - 'view_count': int_or_none(data.get('viewCount')), - 'uploader': data.get('channel'), - 'tags': data.get('tags'), + 'title': title, + 'description': strip_or_none(data.get('description')), + 'thumbnail': data.get('thumbnail'), + 'duration': float_or_none(data.get('realDuration')), + 'timestamp': int_or_none(data.get('liveFrom')), + 'series': data.get('programLabel'), + 'subtitles': subtitles, + 'is_live': is_live, } From 18825117545690499dc7064cd5ba207ca5ca3e23 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 17 Jun 2018 12:01:14 +0100 Subject: [PATCH 005/111] [6play] add support for rtlplay.be and extract hd usp formats --- youtube_dl/extractor/sixplay.py | 43 ++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 69951e387..1f8469a90 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -19,29 +19,33 @@ from ..utils import ( class SixPlayIE(InfoExtractor): IE_NAME = '6play' - _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P[0-9]+)' - _TEST = { - 'url': 'http://www.6play.fr/le-meilleur-patissier-p_1807/le-meilleur-patissier-special-fetes-mercredi-a-21-00-sur-m6-c_11638450', - 'md5': '42310bffe4ba3982db112b9cd3467328', + _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P6play\.fr|rtlplay.be)/.+?-c_)(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051', + 'md5': '31fcd112637baa0c2ab92c4fcd8baf27', 'info_dict': { - 'id': '11638450', + 'id': '12041051', 'ext': 'mp4', - 'title': 'Le Meilleur Pâtissier, spécial fêtes mercredi à 21:00 sur M6', - 'description': 'md5:308853f6a5f9e2d55a30fc0654de415f', - 'duration': 39, - 'series': 'Le meilleur pâtissier', + 'title': 'Le but qui a marqué l\'histoire du football français !', + 'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851', }, - 'params': { - 'skip_download': True, - }, - } + }, { + 'url': 
'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) + domain, video_id = re.search(self._VALID_URL, url).groups() + service, consumer_name = { + '6play.fr': ('6play', 'm6web'), + 'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'), + }.get(domain, ('6play', 'm6web')) data = self._download_json( - 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/6play/videos/clip_%s' % video_id, - video_id, query={ + 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id), + video_id, headers={ + 'x-customer-name': consumer_name + }, query={ 'csa': 5, 'with': 'clips', }) @@ -65,7 +69,12 @@ class SixPlayIE(InfoExtractor): subtitles.setdefault('fr', []).append({'url': asset_url}) continue if container == 'm3u8' or ext == 'm3u8': - if protocol == 'usp' and not compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: + if protocol == 'usp': + if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: + urlh = self._request_webpage(asset_url, video_id, fatal=False) + if not urlh: + continue + asset_url = urlh.geturl() asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url) formats.extend(self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', From 8b183bd5f800792cfc37da8ef2383fb5ba88195c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 17 Jun 2018 15:53:29 +0100 Subject: [PATCH 006/111] [tf1] try all supported adaptive urls --- youtube_dl/extractor/tf1.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index e595c4a69..903f47380 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -19,6 +19,7 @@ class TF1IE(InfoExtractor): # Sometimes wat serves the whole file with the --test option 'skip_download': True, }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', 'info_dict': { From 0adf213d8cce21e1a6ca6be7df532d67d184fbe2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 17 Jun 2018 15:56:52 +0100 Subject: [PATCH 007/111] [wat] try all supported adaptive urls --- youtube_dl/extractor/wat.py | 41 +++++++++++++++---------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 20fef1f04..8ef3e0906 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -19,7 +19,6 @@ class WatIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', - 'md5': '83d882d9de5c9d97f0bb2c6273cde56a', 'info_dict': { 'id': '11713067', 'ext': 'mp4', @@ -28,10 +27,15 @@ class WatIE(InfoExtractor): 'upload_date': '20140819', 'duration': 120, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': '34bdfa5ca9fd3c7eb88601b635b0424c', + 'md5': 'b16574df2c3cd1a36ca0098f2a791925', 'info_dict': { 'id': '11713075', 'ext': 'mp4', @@ -98,38 +102,25 @@ class WatIE(InfoExtractor): formats = [] try: + alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] manifest_urls = self._download_json( 
'http://www.wat.tv/get/webhtml/' + video_id, video_id) m3u8_url = manifest_urls.get('hls') if m3u8_url: m3u8_url = remove_bitrate_limit(m3u8_url) - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) + for m3u8_alt_url in alt_urls(m3u8_url): + formats.extend(self._extract_m3u8_formats( + m3u8_alt_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) formats.extend(self._extract_f4m_formats( - m3u8_url.replace('ios', 'web').replace('.m3u8', '.f4m'), + m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), video_id, f4m_id='hds', fatal=False)) - http_url = extract_url('android5/%s.mp4', 'http') - if http_url: - for m3u8_format in m3u8_formats: - vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') - if not vbr or not abr: - continue - format_id = m3u8_format['format_id'].replace('hls', 'http') - fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url) - if self._is_valid_url(fmt_url, video_id, format_id): - f = m3u8_format.copy() - f.update({ - 'url': fmt_url, - 'format_id': format_id, - 'protocol': 'http', - }) - formats.append(f) mpd_url = manifest_urls.get('mpd') if mpd_url: - formats.extend(self._extract_mpd_formats(remove_bitrate_limit( - mpd_url), video_id, mpd_id='dash', fatal=False)) + mpd_url = remove_bitrate_limit(mpd_url) + for mpd_alt_url in alt_urls(mpd_url): + formats.extend(self._extract_mpd_formats( + mpd_alt_url, video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) except ExtractorError: abr = 64 From ce0edda0f9c0d8cf6250edfa7a43ddbccd101cd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 Jun 2018 00:49:50 +0700 Subject: [PATCH 008/111] [markiza] Add extractors (closes #16750) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/markiza.py | 121 +++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 youtube_dl/extractor/markiza.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c3e6daa24..3b3964c01 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -590,6 +590,10 @@ from .mangomolo import ( MangomoloLiveIE, ) from .manyvids import ManyVidsIE +from .markiza import ( + MarkizaIE, + MarkizaPageIE, +) from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE diff --git a/youtube_dl/extractor/markiza.py b/youtube_dl/extractor/markiza.py new file mode 100644 index 000000000..e6bfab114 --- /dev/null +++ b/youtube_dl/extractor/markiza.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + orderedSet, + parse_duration, + try_get, +) + + +class MarkizaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?videoarchiv\.markiza\.sk/(?:video/(?:[^/]+/)*|embed/)(?P\d+)(?:[_/]|$)' + _TESTS = [{ + 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723_oteckovia-109', + 'md5': 'ada4e9fad038abeed971843aa028c7b0', + 'info_dict': { + 'id': '139078', + 'ext': 'mp4', + 'title': 'Oteckovia 109', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2760, + }, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/televizne-noviny/televizne-noviny/85430_televizne-noviny', + 'info_dict': { + 'id': '85430', + 
'title': 'Televízne noviny', + }, + 'playlist_count': 23, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/84723', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/filmy/85190_kamenak', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/reflex/zo-zakulisia/84651_pribeh-alzbetky', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/embed/85295', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'http://videoarchiv.markiza.sk/json/video_jwplayer7.json', + video_id, query={'id': video_id}) + + info = self._parse_jwplayer_data(data, m3u8_id='hls', mpd_id='dash') + + if info.get('_type') == 'playlist': + info.update({ + 'id': video_id, + 'title': try_get( + data, lambda x: x['details']['name'], compat_str), + }) + else: + info['duration'] = parse_duration( + try_get(data, lambda x: x['details']['duration'], compat_str)) + return info + + +class MarkizaPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:(?:[^/]+\.)?markiza|tvnoviny)\.sk/(?:[^/]+/)*(?P\d+)_' + _TESTS = [{ + 'url': 'http://www.markiza.sk/soubiz/zahranicny/1923705_oteckovia-maju-svoj-den-ti-slavni-nie-su-o-nic-menej-rozkosni', + 'md5': 'ada4e9fad038abeed971843aa028c7b0', + 'info_dict': { + 'id': '139355', + 'ext': 'mp4', + 'title': 'Oteckovia 110', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2604, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://dajto.markiza.sk/filmy-a-serialy/1774695_frajeri-vo-vegas', + 'only_matching': True, + }, { + 'url': 'http://superstar.markiza.sk/aktualne/1923870_to-je-ale-telo-spevacka-ukazala-sexy-postavicku-v-bikinach', + 'only_matching': True, + }, { + 'url': 'http://hybsa.markiza.sk/aktualne/1923790_uzasna-atmosfera-na-hybsa-v-poprade-superstaristi-si-prve-koncerty-pred-davom-ludi-poriadne-uzili', + 'only_matching': True, + }, { + 'url': 'http://doma.markiza.sk/filmy/1885250_moja-vysnivana-svadba', + 'only_matching': True, + }, { + 'url': 'http://www.tvnoviny.sk/domace/1923887_po-smrti-manzela-ju-cakalo-poriadne-prekvapenie', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MarkizaIE.suitable(url) else super(MarkizaPageIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id) + for video_id in orderedSet(re.findall( + r'(?:initPlayer_|data-entity=["\']|id=["\']player_)(\d+)', + webpage))] + + return self.playlist_result(entries, playlist_id) From 9e761fe6f555a3ad0b92bdc2c651a4c5b8aff887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 Jun 2018 01:31:49 +0700 Subject: [PATCH 009/111] [ChangeLog] Actualize [ci skip] --- ChangeLog | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog b/ChangeLog index 062000594..38cfcd8fd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +version + +Core +* [downloader/rtmp] Fix downloading in verbose mode (#16736) + +Extractors ++ [markiza] Add support for markiza.sk (#16750) +* [wat] Try all supported adaptive URLs ++ [6play] Add support for rtlplay.be and extract hd usp formats ++ [rtbf] Add support for audio and live streams (#9638, 
#11923) ++ [rtbf] Extract HLS, DASH and all HTTP formats ++ [rtbf] Extract subtitles ++ [rtbf] Fixup specific HTTP URLs (#16101) ++ [expressen] Add support for expressen.se +* [vidzi] Fix extraction (#16678) +* [pbs] Improve extraction (#16623, #16684) +* [bilibili] Restrict cid regular expression (#16638, #16734) + + version 2018.06.14 Core From 858cf4dc2966d398d939cedffc160afad2484f8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 Jun 2018 01:34:36 +0700 Subject: [PATCH 010/111] release 2018.06.18 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 1cfb54bfd..de3888214 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.14** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.14 +[debug] youtube-dl version 2018.06.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 38cfcd8fd..fe5087097 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.06.18 Core * [downloader/rtmp] Fix downloading in verbose mode (#16736) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 705279ac1..432a7ba93 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -266,6 +266,7 @@ - **Europa** - **EveryonesMixtape** - **ExpoTV** + - **Expressen** - **ExtremeTube** - **EyedoTV** - **facebook** @@ -455,6 +456,8 @@ - **mangomolo:live** - **mangomolo:video** - **ManyVids** + - **Markiza** + - **MarkizaPage** - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1533dceb4..49fef60ea 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.14' +__version__ = '2018.06.18' From 8ba84e4600229c9baec6410b0c0c9e500c0105b5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 17 Jun 2018 20:40:35 +0100 Subject: [PATCH 011/111] [tvnow] 
try all clear manifest urls(closes #15361)

---
 youtube_dl/extractor/tvnow.py | 53 +++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py
index 808571ece..60937616f 100644
--- a/youtube_dl/extractor/tvnow.py
+++ b/youtube_dl/extractor/tvnow.py
@@ -19,8 +19,8 @@ class TVNowBaseIE(InfoExtractor):
     _VIDEO_FIELDS = (
         'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort',
         'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode',
-        'manifest.dashclear', 'format.title', 'format.defaultImage169Format',
-        'format.defaultImage169Logo')
+        'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear',
+        'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo')
 
     def _call_api(self, path, video_id, query):
         return self._download_json(
@@ -31,27 +31,42 @@ class TVNowBaseIE(InfoExtractor):
         video_id = compat_str(info['id'])
         title = info['title']
 
-        mpd_url = info['manifest']['dashclear']
-        if not mpd_url:
+        paths = []
+        for manifest_url in (info.get('manifest') or {}).values():
+            if not manifest_url:
+                continue
+            manifest_url = update_url_query(manifest_url, {'filter': ''})
+            path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path')
+            if path in paths:
+                continue
+            paths.append(path)
+
+            def url_repl(proto, suffix):
+                return re.sub(
+                    r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
+                        r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
+                        '.ism/' + suffix, manifest_url))
+
+            formats = self._extract_mpd_formats(
+                url_repl('dash', '.mpd'), video_id,
+                mpd_id='dash', fatal=False)
+            formats.extend(self._extract_ism_formats(
+                url_repl('hss', 'Manifest'),
+                video_id, ism_id='mss', fatal=False))
+            formats.extend(self._extract_m3u8_formats(
+                url_repl('hls', '.m3u8'), video_id, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False))
+            if formats:
+                break
+        else:
             if info.get('isDrm'):
                 raise ExtractorError(
                     'Video %s is DRM protected' % video_id, expected=True)
             if info.get('geoblocked'):
-                raise ExtractorError(
-                    'Video %s is not available from your location due to geo restriction' % video_id,
-                    expected=True)
+                raise self.raise_geo_restricted()
             if not info.get('free', True):
                 raise ExtractorError(
                     'Video %s is not available for free' % video_id, expected=True)
-
-        mpd_url = update_url_query(mpd_url, {'filter': ''})
-        formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False)
-        formats.extend(self._extract_ism_formats(
-            mpd_url.replace('dash.', 'hss.').replace('/.mpd', '/Manifest'),
-            video_id, ism_id='mss', fatal=False))
-        formats.extend(self._extract_m3u8_formats(
-            mpd_url.replace('dash.', 'hls.').replace('/.mpd', '/.m3u8'),
-            video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
         self._sort_formats(formats)
 
         description = info.get('articleLong') or info.get('articleShort')
@@ -88,7 +103,7 @@ class TVNowBaseIE(InfoExtractor):
 class TVNowIE(TVNowBaseIE):
     _VALID_URL = r'''(?x)
                     https?://
-                        (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/
+                        (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/
                         (?P<show_id>[^/]+)/
                         (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+)
                     '''
@@ -140,11 +155,13 @@ class TVNowIE(TVNowBaseIE):
     }]
 
     def _real_extract(self, url):
-        display_id = '%s/%s' % re.match(self._VALID_URL, url).groups()
+        mobj = re.match(self._VALID_URL, url)
+        display_id = '%s/%s' % mobj.group(2, 3)
 
         info = self._call_api(
             'movies/' + display_id, display_id, query={
                 'fields': ','.join(self._VIDEO_FIELDS),
+                'station': mobj.group(1),
             })
 
         return self._extract_video(info, display_id)

From 
075a13d3e9e860f0033ea5a37795bebba02690b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 Jun 2018 03:22:08 +0700 Subject: [PATCH 012/111] [compat] Introduce compat_integer_types --- youtube_dl/compat.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 4a611f183..7b770340f 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2787,6 +2787,12 @@ except NameError: # Python 3 compat_numeric_types = (int, float, complex) +try: + compat_integer_types = (int, long) +except NameError: # Python 3 + compat_integer_types = (int, ) + + if sys.version_info < (2, 7): def compat_socket_create_connection(address, timeout, source_address=None): host, port = address @@ -2974,6 +2980,7 @@ __all__ = [ 'compat_http_client', 'compat_http_server', 'compat_input', + 'compat_integer_types', 'compat_itertools_count', 'compat_kwargs', 'compat_numeric_types', From d391b7e23d3d6c2af03c6329b4bf059ec095f33d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 Jun 2018 04:01:48 +0700 Subject: [PATCH 013/111] [extractor/common] Introduce expected_status for convenient accept of failed HTTP requests Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response. --- youtube_dl/extractor/common.py | 135 +++++++++++++++++++++++++++------ 1 file changed, 113 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a2548dba3..394f34372 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -19,6 +19,7 @@ from ..compat import ( compat_cookies, compat_etree_fromstring, compat_getpass, + compat_integer_types, compat_http_client, compat_os_name, compat_str, @@ -548,8 +549,26 @@ class InfoExtractor(object): def IE_NAME(self): return compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): - """ Returns the response handle """ + @staticmethod + def __can_accept_status_code(err, expected_status): + assert isinstance(err, compat_urllib_error.HTTPError) + if expected_status is None: + return False + if isinstance(expected_status, compat_integer_types): + return err.code == expected_status + elif isinstance(expected_status, (list, tuple)): + return err.code in expected_status + elif callable(expected_status): + return expected_status(err.code) is True + else: + assert False + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): + """ + Return the response handle. + + See _download_webpage docstring for arguments specification. 
+ """ if note is None: self.report_download_webpage(video_id) elif note is not False: @@ -578,6 +597,10 @@ class InfoExtractor(object): try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if isinstance(err, compat_urllib_error.HTTPError): + if self.__can_accept_status_code(err, expected_status): + return err.fp + if errnote is False: return False if errnote is None: @@ -590,13 +613,17 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}): - """ Returns a tuple (page content as string, URL handle) """ + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + """ + Return a tuple (page content as string, URL handle). + + See _download_webpage docstring for arguments specification. + """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) if urlh is False: assert not fatal return False @@ -685,13 +712,52 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): - """ Returns the data of the page as a string """ + def _download_webpage( + self, url_or_request, video_id, note=None, errnote=None, + fatal=True, tries=1, timeout=5, encoding=None, data=None, + headers={}, query={}, expected_status=None): + """ + Return the data of the page as a string. + + Arguments: + url_or_request -- plain text URL as a string or + a compat_urllib_request.Requestobject + video_id -- Video/playlist/item identifier (string) + + Keyword arguments: + note -- note printed before downloading (string) + errnote -- note printed in case of an error (string) + fatal -- flag denoting whether error should be considered fatal, + i.e. whether it should cause ExtractionError to be raised, + otherwise a warning will be reported and extraction continued + tries -- number of tries + timeout -- sleep interval between tries + encoding -- encoding for a page content decoding, guessed automatically + when not explicitly specified + data -- POST data (bytes) + headers -- HTTP headers (dict) + query -- URL query (dict) + expected_status -- allows to accept failed HTTP requests (non 2xx + status code) by explicitly specifying a set of accepted status + codes. Can be any of the following entities: + - an integer type specifying an exact failed status code to + accept + - a list or a tuple of integer types specifying a list of + failed status codes to accept + - a callable accepting an actual failed status code and + returning True if it should be accepted + Note that this argument does not affect success status codes (2xx) + which are always accepted. 
+ """ + success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal, + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -707,11 +773,17 @@ class InfoExtractor(object): def _download_xml_handle( self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)""" + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle). + + See _download_webpage docstring for arguments specification. + """ res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) if res is False: return res xml_string, urlh = res @@ -719,15 +791,21 @@ class InfoExtractor(object): xml_string, video_id, transform_source=transform_source, fatal=fatal), urlh - def _download_xml(self, url_or_request, video_id, - note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, - data=None, headers={}, query={}): - """Return the xml as an xml.etree.ElementTree.Element""" + def _download_xml( + self, url_or_request, video_id, + note='Downloading XML', errnote='Unable to download XML', + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}, expected_status=None): + """ + Return the xml as an xml.etree.ElementTree.Element. + + See _download_webpage docstring for arguments specification. + """ res = self._download_xml_handle( url_or_request, video_id, note=note, errnote=errnote, transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query) + data=data, headers=headers, query=query, + expected_status=expected_status) return res if res is False else res[0] def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): @@ -745,11 +823,17 @@ class InfoExtractor(object): def _download_json_handle( self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - """Return a tuple (JSON object, URL handle)""" + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (JSON object, URL handle). + + See _download_webpage docstring for arguments specification. 
+ """ res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) if res is False: return res json_string, urlh = res @@ -760,11 +844,18 @@ class InfoExtractor(object): def _download_json( self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return the JSON object as a dict. + + See _download_webpage docstring for arguments specification. + """ res = self._download_json_handle( url_or_request, video_id, note=note, errnote=errnote, transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query) + data=data, headers=headers, query=query, + expected_status=expected_status) return res if res is False else res[0] def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): From 00a429bea3c2deacef5dbfb2b0b7e191b1dbaf62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 Jun 2018 04:04:13 +0700 Subject: [PATCH 014/111] [markiza] Expect 500 status code --- youtube_dl/extractor/markiza.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/markiza.py b/youtube_dl/extractor/markiza.py index e6bfab114..def960a0c 100644 --- a/youtube_dl/extractor/markiza.py +++ b/youtube_dl/extractor/markiza.py @@ -110,7 +110,11 @@ class MarkizaPageIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + webpage = self._download_webpage( + # Downloading for some hosts (e.g. dajto, doma) fails with 500 + # although everything seems to be OK, so considering 500 + # status code to be expected. 
+ url, playlist_id, expected_status=500) entries = [ self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id) From 9283d4ea03f907f2b9e7954b0897075a165b4d4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 Jun 2018 04:04:47 +0700 Subject: [PATCH 015/111] [bbccouk] Use expected_status --- youtube_dl/extractor/bbc.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 30a63a24e..293d82b0f 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -21,7 +21,6 @@ from ..utils import ( urljoin, ) from ..compat import ( - compat_etree_fromstring, compat_HTTPError, compat_urlparse, ) @@ -334,14 +333,9 @@ class BBCCoUkIE(InfoExtractor): self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - try: - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404): - media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) - else: - raise + media_selection = self._download_xml( + url, programme_id, 'Downloading media selection XML', + expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) def _process_media_selector(self, media_selection, programme_id): From 721a877d2fb82de18e4aeec27d70f84f9b41f766 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 Jun 2018 23:08:35 +0700 Subject: [PATCH 016/111] [vgtv] Add support for www.aftonbladet.se/tv/ URLs --- youtube_dl/extractor/vgtv.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index c21a09c01..d430e2944 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -24,6 +24,7 @@ class VGTVIE(XstreamIE): 'aftenposten.no/webtv': 'aptv', 'ap.vgtv.no/webtv': 'aptv', 'tv.aftonbladet.se/abtv': 'abtv', + 'www.aftonbladet.se/tv': 'abtv', } _APP_NAME_TO_VENDOR = { @@ -44,7 +45,7 @@ class VGTVIE(XstreamIE): (?: (?:\#!/)?(?:video|live)/| embed?.*id=| - articles/ + a(?:rticles)?/ )| (?P %s @@ -143,6 +144,10 @@ class VGTVIE(XstreamIE): 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'only_matching': True, }, + { + 'url': 'https://www.aftonbladet.se/tv/a/36015', + 'only_matching': True, + }, { 'url': 'abtv:140026', 'only_matching': True, From 713afa705c228c2caa6054fff19a7690ba19d64a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 Jun 2018 23:15:38 +0700 Subject: [PATCH 017/111] [vgtv] Improve HLS formats extraction --- youtube_dl/extractor/vgtv.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index d430e2944..fe7a26b62 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -183,13 +183,15 @@ class VGTVIE(XstreamIE): streams = data['streamUrls'] stream_type = data.get('streamType') - + is_live = stream_type == 'live' formats = [] hls_url = streams.get('hls') if hls_url: formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + hls_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) hds_url = streams.get('hds') if hds_url: @@ -234,13 +236,13 @@ class VGTVIE(XstreamIE): info.update({ 'id': video_id, - 
'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], + 'title': self._live_title(data['title']) if is_live else data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], 'duration': float_or_none(data['duration'], 1000), 'view_count': data['displays'], - 'is_live': True if stream_type == 'live' else False, + 'is_live': is_live, }) return info From 18806e3b6b95d03c773c89e465e1b28b2f12a618 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 18 Jun 2018 19:08:54 +0100 Subject: [PATCH 018/111] [rtbf] fix extraction for python 3.2 and older --- youtube_dl/extractor/rtbf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index acff9766a..3b0f3080b 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -99,7 +99,7 @@ class RTBFIE(InfoExtractor): http_url = data.get('url') if formats and http_url and re.search(height_re, http_url): http_url = fix_url(http_url) - for m3u8_f in formats.copy(): + for m3u8_f in formats[:]: height = m3u8_f.get('height') if not height: continue From e12b4b8bccd364cb5cc68aab4888209965a82dc1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Jun 2018 10:35:42 +0100 Subject: [PATCH 019/111] [6play] use geo verfication headers --- youtube_dl/extractor/sixplay.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 1f8469a90..a363221bc 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -71,7 +71,9 @@ class SixPlayIE(InfoExtractor): if container == 'm3u8' or ext == 'm3u8': if protocol == 'usp': if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: - urlh = self._request_webpage(asset_url, video_id, fatal=False) + urlh = self._request_webpage( + asset_url, video_id, fatal=False, + headers=self.geo_verification_headers()) if not urlh: continue asset_url = urlh.geturl() From 8b4b400aef83b233502ece7321ee84f6ab9e213e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 19 Jun 2018 23:00:36 +0700 Subject: [PATCH 020/111] [peertube] Improve generic support (closes #16733) --- youtube_dl/extractor/generic.py | 2 +- youtube_dl/extractor/peertube.py | 47 +++++++++++++++++++++++--------- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index dad951b75..6c0f772ac 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -3076,7 +3076,7 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) - peertube_urls = PeerTubeIE._extract_urls(webpage) + peertube_urls = PeerTubeIE._extract_urls(webpage, url) if peertube_urls: return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index a481b3151..d9849a2ba 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -116,12 +116,14 @@ class PeerTubeIE(InfoExtractor): videos\.tcit\.fr| peertube\.cpy\.re )''' + _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' _VALID_URL = r'''(?x) - https?:// - %s - /(?:videos/(?:watch|embed)|api/v\d/videos)/ - (?P[^/?\#&]+) - ''' % 
_INSTANCES_RE
+                    (?:
+                        peertube:(?P<host>[^:]+):|
+                        https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/
+                    )
+                    (?P<id>%s)
+                    ''' % (_INSTANCES_RE, _UUID_RE)
     _TESTS = [{
         'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c',
         'md5': '80f24ff364cc9d333529506a263e7feb',
@@ -157,21 +159,40 @@ class PeerTubeIE(InfoExtractor):
     }, {
         'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8',
         'only_matching': True,
+    }, {
+        'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205',
+        'only_matching': True,
     }]
 
     @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'''(?x)<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s/videos/embed/[^/?\#&]+)\1'''
-                % PeerTubeIE._INSTANCES_RE, webpage)]
+    def _extract_peertube_url(webpage, source_url):
+        mobj = re.match(
+            r'https?://(?P<host>[^/]+)/videos/watch/(?P<id>%s)'
+            % PeerTubeIE._UUID_RE, source_url)
+        if mobj and any(p in webpage for p in (
+                '<title>PeerTube<',
+                'There will be other non JS-based clients to access PeerTube',
+                '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')):
+            return 'peertube:%s:%s' % mobj.group('host', 'id')
+
+    @staticmethod
+    def _extract_urls(webpage, source_url):
+        entries = re.findall(
+            r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)'''
+            % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage)
+        if not entries:
+            peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url)
+            if peertube_url:
+                entries = [peertube_url]
+        return entries
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        host = mobj.group('host') or mobj.group('host_2')
+        video_id = mobj.group('id')
 
         video = self._download_json(
-            urljoin(url, '/api/v1/videos/%s' % video_id), video_id)
+            'https://%s/api/v1/videos/%s' % (host, video_id), video_id)
 
         title = video['name']
 

From e73050882763705ccb8e487edbc3983b5582b1a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Tue, 19 Jun 2018 23:12:53 +0700
Subject: [PATCH 021/111] [ChangeLog] Actualize

[ci skip]
---
 ChangeLog | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index fe5087097..1494081b8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+version <unreleased>
+
+Core
++ [extractor/common] Introduce expected_status in _download_* methods
+  for convenient accept of HTTP requests failed with non 2xx status codes
++ [compat] Introduce compat_integer_types
+
+Extractors
+* [peertube] Improve generic support (#16733)
++ [6play] Use geo verification headers
+* [rtbf] Fix extraction for python 3.2
+* [vgtv] Improve HLS formats extraction
++ [vgtv] Add support for www.aftonbladet.se/tv URLs
+* [bbccouk] Use expected_status
+* [markiza] Expect 500 HTTP status code
+* [tvnow] Try all clear manifest URLs (#15361)
+
+
 version 2018.06.18
 
 Core

From c9b983ff827aae25a0fe2116c98c26702c581b81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Tue, 19 Jun 2018 23:16:04 +0700
Subject: [PATCH 022/111] release 2018.06.19

---
 .github/ISSUE_TEMPLATE.md | 6 +++---
 ChangeLog                 | 2 +-
 youtube_dl/version.py     | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index de3888214..d254678b5 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -6,8 +6,8 @@
 
 ---
 
-### Make sure you are using the *latest* version: run `youtube-dl 
--version` and ensure your version is *2018.06.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.19*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.19** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.18 +[debug] youtube-dl version 2018.06.19 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 1494081b8..93dc40d8c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.19 Core + [extractor/common] Introduce expected_status in _download_* methods diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 49fef60ea..dd4795cd1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.18' +__version__ = '2018.06.19' From f51f526b0acb5943332452d1958581cb1135bfe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 20 Jun 2018 23:51:14 +0700 Subject: [PATCH 023/111] [foxnews] Add support for iframe embeds (closes #15810, closes #16711) --- youtube_dl/extractor/foxnews.py | 42 +++++++++++++++++++++++++++------ youtube_dl/extractor/generic.py | 6 +++++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index dc0662f74..4c402806a 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -58,6 +58,14 @@ class FoxNewsIE(AMPIE): }, ] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1', + webpage)] + def _real_extract(self, url): host, video_id = re.match(self._VALID_URL, url).groups() @@ -71,18 +79,35 @@ class FoxNewsArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' IE_NAME = 'foxnews:article' - _TEST = { + _TESTS = [{ + # data-video-id 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '62aa5a781b308fdee212ebb6f33ae7ef', + 'md5': '83d44e1aff1433e7a29a7b537d1700b5', 'info_dict': { 'id': '5116295019001', 'ext': 'mp4', 'title': 'Trump and Clinton asked to defend positions on Iraq War', 'description': 
'Veterans react on \'The Kelly File\'', - 'timestamp': 1473299755, + 'timestamp': 1473301045, 'upload_date': '20160908', }, - } + }, { + # iframe embed + 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'info_dict': { + 'id': '5748266721001', + 'ext': 'flv', + 'title': 'Kyle Kashuv has a positive message for the Trump White House', + 'description': 'Marjory Stoneman Douglas student disagrees with classmates.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 229, + 'timestamp': 1520594670, + 'upload_date': '20180309', + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -90,10 +115,13 @@ class FoxNewsArticleIE(InfoExtractor): video_id = self._html_search_regex( r'data-video-id=([\'"])(?P<id>[^\'"]+)\1', - webpage, 'video ID', group='id') + webpage, 'video ID', group='id', default=None) + if video_id: + return self.url_result( + 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) + return self.url_result( - 'http://video.foxnews.com/v/' + video_id, - FoxNewsIE.ie_key()) + FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) class FoxNewsInsiderIE(InfoExtractor): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6c0f772ac..d71cb9050 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -111,6 +111,7 @@ from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE from .indavideo import IndavideoEmbedIE from .apa import APAIE +from .foxnews import FoxNewsIE class GenericIE(InfoExtractor): @@ -3091,6 +3092,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( apa_urls, video_id, video_title, ie=APAIE.ie_key()) + foxnews_urls = FoxNewsIE._extract_urls(webpage) + if foxnews_urls: + return self.playlist_from_matches( + foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] From 91aa502d916fd3f103d34f927748767413f1d1a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 20 Jun 2018 23:59:37 +0700 Subject: [PATCH 024/111] [foxnews:insider] Remove extractor (#15810) Now covered by foxnews:article --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/foxnews.py | 49 +++--------------------------- 2 files changed, 4 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3b3964c01..27ece3b53 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -373,7 +373,6 @@ from .foxgay import FoxgayIE from .foxnews import ( FoxNewsIE, FoxNewsArticleIE, - FoxNewsInsiderIE, ) from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 4c402806a..63613cb85 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -76,7 +76,7 @@ class FoxNewsIE(AMPIE): class FoxNewsArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' IE_NAME = 'foxnews:article' _TESTS = [{ @@ -107,6 +107,9 @@ class 
FoxNewsArticleIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', + 'only_matching': True, }] def _real_extract(self, url): @@ -122,47 +125,3 @@ class FoxNewsArticleIE(InfoExtractor): return self.url_result( FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) - - -class FoxNewsInsiderIE(InfoExtractor): - _VALID_URL = r'https?://insider\.foxnews\.com/([^/]+/)+(?P<id>[a-z-]+)' - IE_NAME = 'foxnews:insider' - - _TEST = { - 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', - 'md5': 'a10c755e582d28120c62749b4feb4c0c', - 'info_dict': { - 'id': '5099377331001', - 'display_id': 'univ-wisconsin-student-group-pushing-silence-certain-words', - 'ext': 'mp4', - 'title': 'Student Group: Saying \'Politically Correct,\' \'Trash\' and \'Lame\' Is Offensive', - 'description': 'Is campus censorship getting out of control?', - 'timestamp': 1472168725, - 'upload_date': '20160825', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': [FoxNewsIE.ie_key()], - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - embed_url = self._html_search_meta('embedUrl', webpage, 'embed URL') - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - - return { - '_type': 'url_transparent', - 'ie_key': FoxNewsIE.ie_key(), - 'url': embed_url, - 'display_id': display_id, - 'title': title, - 'description': description, - } From 30374f4d40d8c993bf92c5af9b9c073da49fe8b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Jun 2018 23:06:58 +0700 Subject: [PATCH 025/111] [itv] Make SOAP request non fatal and extract metadata from a webpage (closes #16780) --- youtube_dl/extractor/itv.py | 126 ++++++++++++++++++++---------------- 1 file changed, 69 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 6a4f8a505..40cffed46 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -18,6 +18,7 @@ from ..utils import ( xpath_element, xpath_text, int_or_none, + merge_dicts, parse_duration, smuggle_url, ExtractorError, @@ -129,64 +130,65 @@ class ITVIE(InfoExtractor): resp_env = self._download_xml( params['data-playlist-url'], video_id, - headers=headers, data=etree.tostring(req_env)) - playlist = xpath_element(resp_env, './/Playlist') - if playlist is None: - fault_code = xpath_text(resp_env, './/faultcode') - fault_string = xpath_text(resp_env, './/faultstring') - if fault_code == 'InvalidGeoRegion': - self.raise_geo_restricted( - msg=fault_string, countries=self._GEO_COUNTRIES) - elif fault_code not in ( - 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, fault_string), expected=True) - info.update({ - 'title': self._og_search_title(webpage), - 'episode_title': params.get('data-video-episode'), - 'series': params.get('data-video-title'), - }) - else: - title = xpath_text(playlist, 'EpisodeTitle', default=None) - info.update({ - 'title': title, - 'episode_title': title, - 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), - 'series': xpath_text(playlist, 'ProgrammeTitle'), - 'duration': parse_duration(xpath_text(playlist, 'Duration')), - }) - video_element = xpath_element(playlist, 
'VideoEntries/Video', fatal=True) - media_files = xpath_element(video_element, 'MediaFiles', fatal=True) - rtmp_url = media_files.attrib['base'] + headers=headers, data=etree.tostring(req_env), fatal=False) + if resp_env: + playlist = xpath_element(resp_env, './/Playlist') + if playlist is None: + fault_code = xpath_text(resp_env, './/faultcode') + fault_string = xpath_text(resp_env, './/faultstring') + if fault_code == 'InvalidGeoRegion': + self.raise_geo_restricted( + msg=fault_string, countries=self._GEO_COUNTRIES) + elif fault_code not in ( + 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, fault_string), expected=True) + info.update({ + 'title': self._og_search_title(webpage), + 'episode_title': params.get('data-video-episode'), + 'series': params.get('data-video-title'), + }) + else: + title = xpath_text(playlist, 'EpisodeTitle', default=None) + info.update({ + 'title': title, + 'episode_title': title, + 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), + 'series': xpath_text(playlist, 'ProgrammeTitle'), + 'duration': parse_duration(xpath_text(playlist, 'Duration')), + }) + video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) + media_files = xpath_element(video_element, 'MediaFiles', fatal=True) + rtmp_url = media_files.attrib['base'] - for media_file in media_files.findall('MediaFile'): - play_path = xpath_text(media_file, 'URL') - if not play_path: - continue - tbr = int_or_none(media_file.get('bitrate'), 1000) - f = { - 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'play_path': play_path, - # Providing this swfVfy allows to avoid truncated downloads - 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', - 'page_url': url, - 'tbr': tbr, - 'ext': 'flv', - } - app = self._search_regex( - 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) - if app: - f.update({ - 'url': rtmp_url.split('?', 1)[0], - 'app': app, - }) - else: - f['url'] = rtmp_url - formats.append(f) + for media_file in media_files.findall('MediaFile'): + play_path = xpath_text(media_file, 'URL') + if not play_path: + continue + tbr = int_or_none(media_file.get('bitrate'), 1000) + f = { + 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), + 'play_path': play_path, + # Providing this swfVfy allows to avoid truncated downloads + 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', + 'page_url': url, + 'tbr': tbr, + 'ext': 'flv', + } + app = self._search_regex( + 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) + if app: + f.update({ + 'url': rtmp_url.split('?', 1)[0], + 'app': app, + }) + else: + f['url'] = rtmp_url + formats.append(f) - for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): - if caption_url.text: - extract_subtitle(caption_url.text) + for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): + if caption_url.text: + extract_subtitle(caption_url.text) ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') hmac = params.get('data-video-hmac') @@ -261,7 +263,17 @@ class ITVIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, }) - return info + + webpage_info = self._search_json_ld(webpage, video_id, default={}) + if not webpage_info.get('title'): + webpage_info['title'] = self._html_search_regex( + r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<', + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 
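
With fatal=False the SOAP download now returns None instead of raising, and the entire playlist branch above is guarded, so the webpage metadata path can still run when the RPC fails. The control flow, reduced to a runnable stub (fetch_playlist and the dict shapes are illustrative, not the real SOAP envelope):

    def extract(fetch_playlist, webpage_title):
        info = {}
        resp_env = fetch_playlist()  # may return None, like fatal=False
        if resp_env:
            info['title'] = resp_env.get('EpisodeTitle')
        if not info.get('title'):
            # Fall back to metadata scraped from the webpage.
            info['title'] = webpage_title
        return info

    print(extract(lambda: None, 'Fallback Title'))
    # {'title': 'Fallback Title'}
    print(extract(lambda: {'EpisodeTitle': 'E1'}, 'Fallback Title'))
    # {'title': 'E1'}
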
'twitter:title', webpage, 'title', + default=None) or webpage_info['episode'] + + return merge_dicts(info, webpage_info) class ITVBTCCIE(InfoExtractor): From a4ec45179e554e9b24e32c3c06908804b42a5a9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Jun 2018 23:12:40 +0700 Subject: [PATCH 026/111] [itv] Sort imports --- youtube_dl/extractor/itv.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 40cffed46..d05a7b68d 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -13,16 +13,16 @@ from ..compat import ( compat_etree_register_namespace, ) from ..utils import ( + determine_ext, + ExtractorError, extract_attributes, - xpath_with_ns, - xpath_element, - xpath_text, int_or_none, merge_dicts, parse_duration, smuggle_url, - ExtractorError, - determine_ext, + xpath_with_ns, + xpath_element, + xpath_text, ) From b71cc719103c45365244334a4c481f88cd3534fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Jun 2018 23:38:32 +0700 Subject: [PATCH 027/111] [motherless] Fix extraction (closes #16786) --- youtube_dl/extractor/motherless.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index e24396e79..f191310e1 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -77,8 +77,11 @@ class MotherlessIE(InfoExtractor): title = self._html_search_regex( r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') - video_url = self._html_search_regex( - r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL') + video_url = (self._html_search_regex( + (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'), + webpage, 'video URL', default=None, group='url') or + 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( r'<strong>Views</strong>\s+([^<]+)<', From 9fb62e35f6e7d865a73cc310f24ccfa0700e5e26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Jun 2018 23:39:13 +0700 Subject: [PATCH 028/111] [motherless:group] Fix _VALID_URL --- youtube_dl/extractor/motherless.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index f191310e1..bed5645f2 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -123,7 +123,7 @@ class MotherlessIE(InfoExtractor): class MotherlessGroupIE(InfoExtractor): - _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' + _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' _TESTS = [{ 'url': 'http://motherless.com/g/movie_scenes', 'info_dict': { From 74caf528bc822738dffe231df86ed399fc97a38a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 24 Jun 2018 12:02:16 +0100 Subject: [PATCH 029/111] [brightcove] workaround sonyliv DRM protected videos(closes #16807) --- youtube_dl/extractor/brightcove.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ab62e54d6..14f9a14ed 100644 --- a/youtube_dl/extractor/brightcove.py +++ 
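
merge_dicts gives the playlist-derived info dict priority and lets webpage-scraped values fill only the gaps. A minimal stand-in for the helper (the real one in youtube_dl/utils.py additionally special-cases empty strings):

    def merge_dicts(*dicts):
        merged = {}
        for d in dicts:
            for k, v in d.items():
                # Earlier dicts win; only fill keys that are still missing/None.
                if merged.get(k) is None and v is not None:
                    merged[k] = v
        return merged

    info = {'id': '2a7095', 'title': None}
    webpage_info = {'title': 'Episode 1', 'duration': 2700}
    print(merge_dicts(info, webpage_info))
    # {'id': '2a7095', 'title': 'Episode 1', 'duration': 2700}
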
b/youtube_dl/extractor/brightcove.py @@ -572,7 +572,8 @@ class BrightcoveNewIE(AdobePassIE): container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - if ext == 'ism' or container == 'WVM': + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if ext == 'ism' or container == 'WVM' or source.get('key_systems'): continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -629,6 +630,14 @@ class BrightcoveNewIE(AdobePassIE): 'format_id': build_format_id('rtmp'), }) formats.append(f) + if not formats: + # for sonyliv.com DRM protected videos + s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') + if s3_source_url: + formats.append({ + 'url': s3_source_url, + 'format_id': 'source', + }) errors = json_data.get('errors') if not formats and errors: From a0949fec081d0badd6a584526cd66e8f170625c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jun 2018 23:57:22 +0700 Subject: [PATCH 030/111] [joj] Relax _VALID_URL (closes #16771) --- youtube_dl/extractor/joj.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py index a764023e9..d9f8dbfd2 100644 --- a/youtube_dl/extractor/joj.py +++ b/youtube_dl/extractor/joj.py @@ -18,7 +18,7 @@ class JojIE(InfoExtractor): joj:| https?://media\.joj\.sk/embed/ ) - (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + (?P<id>[^/?#^]+) ''' _TESTS = [{ 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', @@ -29,16 +29,24 @@ class JojIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 3118, } + }, { + 'url': 'https://media.joj.sk/embed/9i1cxv', + 'only_matching': True, }, { 'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932', 'only_matching': True, + }, { + 'url': 'joj:9i1cxv', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): - return re.findall( - r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - webpage) + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1', + webpage)] def _real_extract(self, url): video_id = self._match_id(url) From c306f076ec81334b458f61b2a4ae683a9e732d06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 25 Jun 2018 02:17:14 +0700 Subject: [PATCH 031/111] [ChangeLog] Actualize [ci skip] --- ChangeLog | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ChangeLog b/ChangeLog index 93dc40d8c..327580328 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +version <unreleased> + +Extractors +* [joj] Relax URL regular expression (#16771) +* [brightcove] Workaround sonyliv DRM protected videos (#16807) +* [motherless] Fix extraction (#16786) +* [itv] Make SOAP request non fatal and extract metadata from webpage (#16780) +- [foxnews:insider] Remove extractor (#15810) ++ [foxnews] Add support for iframe embeds (#15810, #16711) + + version 2018.06.19 Core From 1f6cc5807ec69584664388b8edfaf6b3ae442cea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 25 Jun 2018 02:26:02 +0700 Subject: [PATCH 032/111] release 2018.06.25 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 - youtube_dl/version.py | 2 +- 4 files changed, 5 insertions(+), 6 
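
The key_systems test above skips encrypted renditions before format extraction even starts: in the Playback API response, a source carrying a key_systems object is DRM protected (per the Brightcove docs link in the patch). A runnable illustration with hypothetical URLs:

    sources = [
        {'src': 'https://example.com/clear.m3u8', 'type': 'application/x-mpegURL'},
        {'src': 'https://example.com/drm.mpd', 'key_systems': {'com.widevine.alpha': {}}},
        {'src': 'https://example.com/clear.mp4', 'container': 'MP4'},
    ]
    # Drop anything that advertises a DRM key system.
    playable = [s for s in sources if not s.get('key_systems')]
    print([s['src'] for s in playable])
    # ['https://example.com/clear.m3u8', 'https://example.com/clear.mp4']
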
deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index d254678b5..128e6e681 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.19*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.19** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.25*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.25** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.19 +[debug] youtube-dl version 2018.06.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 327580328..8eb7469d4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.25 Extractors * [joj] Relax URL regular expression (#16771) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 432a7ba93..a78fabb02 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -290,7 +290,6 @@ - **Foxgay** - **foxnews**: Fox News and Fox Business Video - **foxnews:article** - - **foxnews:insider** - **FoxSports** - **france2.fr:generation-what** - **FranceCulture** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index dd4795cd1..8fbafd6a1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.19' +__version__ = '2018.06.25' From c3bcd206eb031de30179c88ac7acd806a477ceae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Jun 2018 00:01:06 +0700 Subject: [PATCH 033/111] [porncom] Fix extraction (closes #16808) --- youtube_dl/extractor/porncom.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py index 60ade06da..5726cab3a 100644 --- a/youtube_dl/extractor/porncom.py +++ b/youtube_dl/extractor/porncom.py @@ -43,7 +43,8 @@ class PornComIE(InfoExtractor): config = self._parse_json( self._search_regex( - r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=', + (r'=\s*({.+?})\s*;\s*v1ar\b', + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), webpage, 'config', default='{}'), display_id, transform_source=js_to_json, fatal=False) @@ -69,7 +70,7 @@ class PornComIE(InfoExtractor): 'height': int(height), 'filesize_approx': parse_filesize(filesize), } for format_url, height, 
filesize in re.findall( - r'<a[^>]+href="(/download/[^"]+)">MPEG4 (\d+)p<span[^>]*>(\d+\s+[a-zA-Z]+)<', + r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<', webpage)] thumbnail = None duration = None From 7b393f9cc5dc4790bcb623c768fa4a3046ef80bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Jun 2018 04:29:11 +0700 Subject: [PATCH 034/111] [svt] Improve extraction and add support for pages (closes #16802) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 11 --- youtube_dl/extractor/svt.py | 117 ++++++++++++++++++++++++----- 3 files changed, 98 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 27ece3b53..f2377521b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1040,6 +1040,7 @@ from .stretchinternet import StretchInternetIE from .sunporno import SunPornoIE from .svt import ( SVTIE, + SVTPageIE, SVTPlayIE, SVTSeriesIE, ) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d71cb9050..aa04905ed 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1395,17 +1395,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - # SVT embed - { - 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', - 'info_dict': { - 'id': '2900353', - 'ext': 'flv', - 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)', - 'duration': 27, - 'age_limit': 0, - }, - }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index f71eab8b2..0901c3163 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -12,6 +12,8 @@ from ..utils import ( determine_ext, dict_get, int_or_none, + orderedSet, + strip_or_none, try_get, urljoin, compat_str, @@ -137,7 +139,12 @@ class SVTPlayBaseIE(SVTBaseIE): class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)' + _VALID_URL = r'''(?x) + (?: + svt:(?P<svt_id>[^/?#&]+)| + https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) + ) + ''' _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -164,10 +171,40 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'https://www.svtplay.se/kanaler/svt1', 'only_matching': True, + }, { + 'url': 'svt:1376446-003A', + 'only_matching': True, + }, { + 'url': 'svt:14278044', + 'only_matching': True, }] + def _adjust_title(self, info): + if info['is_live']: + info['title'] = self._live_title(info['title']) + + def _extract_by_video_id(self, video_id, webpage=None): + data = self._download_json( + 'https://api.svt.se/videoplayer-api/video/%s' % video_id, + video_id, headers=self.geo_verification_headers()) + info_dict = self._extract_video(data, video_id) + if not info_dict.get('title'): + title = dict_get(info_dict, ('episode', 'series')) + if not title and webpage: + title = re.sub( + r'\s*\|\s*.+?$', '', self._og_search_title(webpage)) + if not title: + title = video_id + info_dict['title'] = title + self._adjust_title(info_dict) + return info_dict + def _real_extract(self, url): - video_id = self._match_id(url) + mobj = 
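
The porn.com fix keeps both page layouts alive by trying the new obfuscated form ('... ; v1ar') before the old one. A standalone sketch, with stdlib json standing in for js_to_json, so the sample config below is written as valid JSON:

    import json
    import re

    def extract_player_config(webpage):
        for pattern in (r'=\s*({.+?})\s*;\s*v1ar\b',
                        r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='):
            m = re.search(pattern, webpage)
            if m:
                return json.loads(m.group(1))
        return {}

    page = 'var config = {"file": "https://example.com/v.mp4"} ; v1ar player ='
    print(extract_player_config(page))
    # {'file': 'https://example.com/v.mp4'}
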
re.match(self._VALID_URL, url) + video_id, svt_id = mobj.group('id', 'svt_id') + + if svt_id: + return self._extract_by_video_id(svt_id) webpage = self._download_webpage(url, video_id) @@ -179,10 +216,6 @@ class SVTPlayIE(SVTPlayBaseIE): thumbnail = self._og_search_thumbnail(webpage) - def adjust_title(info): - if info['is_live']: - info['title'] = self._live_title(info['title']) - if data: video_info = try_get( data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], @@ -193,24 +226,14 @@ class SVTPlayIE(SVTPlayBaseIE): 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], 'thumbnail': thumbnail, }) - adjust_title(info_dict) + self._adjust_title(info_dict) return info_dict - video_id = self._search_regex( + svt_id = self._search_regex( r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', - webpage, 'video id', default=None) + webpage, 'video id') - if video_id: - data = self._download_json( - 'https://api.svt.se/videoplayer-api/video/%s' % video_id, - video_id, headers=self.geo_verification_headers()) - info_dict = self._extract_video(data, video_id) - if not info_dict.get('title'): - info_dict['title'] = re.sub( - r'\s*\|\s*.+?$', '', - info_dict.get('episode') or self._og_search_title(webpage)) - adjust_title(info_dict) - return info_dict + return self._extract_by_video_id(svt_id, webpage) class SVTSeriesIE(SVTPlayBaseIE): @@ -292,3 +315,57 @@ class SVTSeriesIE(SVTPlayBaseIE): return self.playlist_result( entries, series_id, title, metadata.get('description')) + + +class SVTPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'info_dict': { + 'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'title': 'GUIDE: Sommarträning du kan göra var och när du vill', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'info_dict': { + 'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”', + }, + 'playlist_count': 1, + }, { + # only programTitle + 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', + 'info_dict': { + 'id': '2900353', + 'ext': 'mp4', + 'title': 'Stjärnorna skojar till det - under SVT-intervjun', + 'duration': 27, + 'age_limit': 0, + }, + }, { + 'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1', + 'only_matching': True, + }, { + 'url': 'https://www.svt.se/vader/manadskronikor/maj2018', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result( + 'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'data-video-id=["\'](\d+)', webpage))] + + title = strip_or_none(self._og_search_title(webpage, default=None)) + + return self.playlist_result(entries, playlist_id, title) From acbd0ff5df5ff9d69e6707ea4fa3e3b4f9cc6528 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Jun 2018 00:35:05 +0700 Subject: [PATCH 035/111] [dctptv] Restore extraction based on REST API (closes #16850) --- youtube_dl/extractor/dctp.py | 82 
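
The page extractor feeds the player extractor through the internal svt:<id> scheme that SVTPlayIE's _VALID_URL now accepts, after deduplicating the ids scraped from the page in document order. A sketch with a minimal stand-in for orderedSet:

    import re

    def ordered_set(items):
        # Keep first occurrences, preserving document order.
        seen = set()
        return [x for x in items if not (x in seen or seen.add(x))]

    webpage = ('<article data-video-id="2900353"></article>'
               '<article data-video-id="2900353"></article>'
               '<article data-video-id="14278044"></article>')
    entries = ['svt:%s' % video_id
               for video_id in ordered_set(
                   re.findall(r'data-video-id=["\'](\d+)', webpage))]
    print(entries)
    # ['svt:2900353', 'svt:14278044']
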
++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 3a6d0560e..dc0c41b8a 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -5,13 +5,15 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( float_or_none, - unified_strdate, + int_or_none, + unified_timestamp, ) class DctpTvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ + # 4x3 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { 'id': '95eaa4f33dad413aa17b4ee613cccc6c', @@ -19,31 +21,49 @@ class DctpTvIE(InfoExtractor): 'ext': 'flv', 'title': 'Videoinstallation für eine Kaufhausfassade', 'description': 'Kurzfilm', - 'upload_date': '20110407', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 71.24, + 'timestamp': 1302172322, + 'upload_date': '20110407', }, 'params': { # rtmp download 'skip_download': True, }, - } + }, { + # 16x9 + 'url': 'http://www.dctp.tv/filme/sind-youtuber-die-besseren-lehrer/', + 'only_matching': True, + }] + + _BASE_URL = 'http://dctp-ivms2-restapi.s3.amazonaws.com' def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + version = self._download_json( + '%s/version.json' % self._BASE_URL, display_id, + 'Downloading version JSON') - video_id = self._html_search_meta( - 'DC.identifier', webpage, 'video id', - default=None) or self._search_regex( - r'id=["\']uuid[^>]+>([^<]+)<', webpage, 'video id') + restapi_base = '%s/%s/restapi' % ( + self._BASE_URL, version['version_name']) - title = self._og_search_title(webpage) + info = self._download_json( + '%s/slugs/%s.json' % (restapi_base, display_id), display_id, + 'Downloading video info JSON') + + media = self._download_json( + '%s/media/%s.json' % (restapi_base, compat_str(info['object_id'])), + display_id, 'Downloading media JSON') + + uuid = media['uuid'] + title = media['title'] + ratio = '16x9' if media.get('is_wide') else '4x3' + play_path = 'mp4:%s_dctp_0500_%s.m4v' % (uuid, ratio) servers = self._download_json( 'http://www.dctp.tv/streaming_servers/', display_id, - note='Downloading server list', fatal=False) + note='Downloading server list JSON', fatal=False) if servers: endpoint = next( @@ -60,27 +80,35 @@ class DctpTvIE(InfoExtractor): formats = [{ 'url': endpoint, 'app': app, - 'play_path': 'mp4:%s_dctp_0500_4x3.m4v' % video_id, + 'play_path': play_path, 'page_url': url, - 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-109.swf', + 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-110.swf', 'ext': 'flv', }] - description = self._html_search_meta('DC.description', webpage) - upload_date = unified_strdate( - self._html_search_meta('DC.date.created', webpage)) - thumbnail = self._og_search_thumbnail(webpage) - duration = float_or_none(self._search_regex( - r'id=["\']duration_in_ms[^+]>(\d+)', webpage, 'duration', - default=None), scale=1000) + thumbnails = [] + images = media.get('images') + if isinstance(images, list): + for image in images: + if not isinstance(image, dict): + continue + image_url = image.get('url') + if not image_url or not isinstance(image_url, compat_str): + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) return { - 'id': video_id, 
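
The restored dctp.tv flow resolves a slug in three hops: version.json names the API revision, the slug document yields an object id, and the media document carries the actual metadata. A sketch of the chain with a stubbed fetch (swap in a real HTTP client to run it against the live service):

    BASE_URL = 'http://dctp-ivms2-restapi.s3.amazonaws.com'

    def fetch_json(url):
        # Stub standing in for _download_json.
        raise NotImplementedError(url)

    def resolve_media(display_id):
        version = fetch_json('%s/version.json' % BASE_URL)
        restapi_base = '%s/%s/restapi' % (BASE_URL, version['version_name'])
        info = fetch_json('%s/slugs/%s.json' % (restapi_base, display_id))
        return fetch_json('%s/media/%s.json' % (restapi_base, info['object_id']))
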
- 'title': title, - 'formats': formats, + 'id': uuid, 'display_id': display_id, - 'description': description, - 'upload_date': upload_date, - 'thumbnail': thumbnail, - 'duration': duration, + 'title': title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description') or media.get('teaser'), + 'timestamp': unified_timestamp(media.get('created')), + 'duration': float_or_none(media.get('duration_in_ms'), scale=1000), + 'thumbnails': thumbnails, + 'formats': formats, } From d4a24f4091a622b808ff621e78b5cfd0db3c8c11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Jun 2018 01:09:14 +0700 Subject: [PATCH 036/111] Prefer ffmpeg over avconv by default (closes #8622) --- youtube_dl/YoutubeDL.py | 4 ++-- youtube_dl/options.py | 4 ++-- youtube_dl/postprocessor/ffmpeg.py | 16 ++++++++-------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2a405c5ca..38ba43a97 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -305,8 +305,8 @@ class YoutubeDL(object): http_chunk_size. The following options are used by the post processors: - prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, - otherwise prefer avconv. + prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, + otherwise prefer ffmpeg. postprocessor_args: A list of additional command-line arguments for the postprocessor. diff --git a/youtube_dl/options.py b/youtube_dl/options.py index e83d546a0..e7d8e8910 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -841,11 +841,11 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--prefer-avconv', action='store_false', dest='prefer_ffmpeg', - help='Prefer avconv over ffmpeg for running the postprocessors (default)') + help='Prefer avconv over ffmpeg for running the postprocessors') postproc.add_option( '--prefer-ffmpeg', action='store_true', dest='prefer_ffmpeg', - help='Prefer ffmpeg over avconv for running the postprocessors') + help='Prefer ffmpeg over avconv for running the postprocessors (default)') postproc.add_option( '--ffmpeg-location', '--avconv-location', metavar='PATH', dest='ffmpeg_location', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 3ea1afcf3..757b496a1 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -77,7 +77,7 @@ class FFmpegPostProcessor(PostProcessor): def _determine_executables(self): programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - prefer_ffmpeg = False + prefer_ffmpeg = True self.basename = None self.probe_basename = None @@ -85,7 +85,7 @@ class FFmpegPostProcessor(PostProcessor): self._paths = None self._versions = None if self._downloader: - prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', False) + prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', True) location = self._downloader.params.get('ffmpeg_location') if location is not None: if not os.path.exists(location): @@ -117,19 +117,19 @@ class FFmpegPostProcessor(PostProcessor): (p, get_exe_version(p, args=['-version'])) for p in programs) self._paths = dict((p, p) for p in programs) - if prefer_ffmpeg: - prefs = ('ffmpeg', 'avconv') - else: + if prefer_ffmpeg is False: prefs = ('avconv', 'ffmpeg') + else: + prefs = ('ffmpeg', 'avconv') for p in prefs: if self._versions[p]: self.basename = p break - if prefer_ffmpeg: - prefs = ('ffprobe', 'avprobe') - else: + if prefer_ffmpeg is False: prefs 
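
Note the `prefer_ffmpeg is False` test in the new code: only an explicit False selects avconv, so the unset (None) case now falls through to the ffmpeg-first ordering. A condensed model of the selection loop with a stubbed version table:

    def pick_binary(available, prefer_ffmpeg=True):
        if prefer_ffmpeg is False:
            prefs = ('avconv', 'ffmpeg')
        else:
            prefs = ('ffmpeg', 'avconv')
        for p in prefs:
            if available.get(p):
                return p
        return None

    print(pick_binary({'avconv': '12.3', 'ffmpeg': '4.0'}))         # ffmpeg
    print(pick_binary({'avconv': '12.3', 'ffmpeg': '4.0'}, False))  # avconv
    print(pick_binary({'avconv': '12.3', 'ffmpeg': None}))          # avconv
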
= ('avprobe', 'ffprobe') + else: + prefs = ('ffprobe', 'avprobe') for p in prefs: if self._versions[p]: self.probe_basename = p From 5e8e2fa51f416e227367211ab937dfea17f89f57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Jun 2018 01:25:05 +0700 Subject: [PATCH 037/111] [extractor/common] Use source URL as Referer for HTML5 entries (closes #16849) --- youtube_dl/extractor/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 394f34372..f3fec160d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2437,6 +2437,8 @@ class InfoExtractor(object): media_info['subtitles'].setdefault(lang, []).append({ 'url': absolute_url(src), }) + for f in media_info['formats']: + f.setdefault('http_headers', {})['Referer'] = base_url if media_info['formats'] or media_info['subtitles']: entries.append(media_info) return entries From 9cf648c92bcc131db7d7fad673864bba06121482 Mon Sep 17 00:00:00 2001 From: Timendum <timedum@gmail.com> Date: Mon, 18 Jun 2018 11:50:06 +0200 Subject: [PATCH 038/111] [mediaset] Add support for new videos --- youtube_dl/extractor/mediaset.py | 60 ++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 9760eafd5..76a2ae125 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -10,6 +10,7 @@ from ..utils import ( parse_duration, try_get, unified_strdate, + ExtractorError ) @@ -42,6 +43,22 @@ class MediasetIE(InfoExtractor): 'categories': ['reality'], }, 'expected_warnings': ['is not a supported codec'], + }, { + 'url': 'http://www.video.mediaset.it/video/matrix/full_chiambretti/puntata-del-25-maggio_846685.html', + 'md5': '1276f966ac423d16ba255ce867de073e', + 'info_dict': { + 'id': '846685', + 'ext': 'mp4', + 'title': 'Puntata del 25 maggio', + 'description': 'md5:ee2e456e3eb1dba5e814596655bb5296', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 6565, + 'creator': 'mediaset', + 'upload_date': '20180525', + 'series': 'Matrix', + 'categories': ['infotainment'], + }, + 'expected_warnings': ['is not a supported codec'], }, { # clip 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', @@ -70,18 +87,29 @@ class MediasetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + media_info = self._download_json( + 'https://www.video.mediaset.it/html/metainfo.sjson', + video_id, 'Downloading media info', query={ + 'id': video_id + })['video'] + + media_id = try_get(media_info, lambda x: x['guid']) or video_id + video_list = self._download_json( - 'http://cdnsel01.mediaset.net/GetCdn.aspx', + 'http://cdnsel01.mediaset.net/GetCdn2018.aspx', video_id, 'Downloading video CDN JSON', query={ - 'streamid': video_id, + 'streamid': media_id, 'format': 'json', })['videoList'] formats = [] for format_url in video_list: if '.ism' in format_url: - formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) + try: + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + except ExtractorError: + pass else: formats.append({ 'url': format_url, @@ -89,30 +117,24 @@ class MediasetIE(InfoExtractor): }) self._sort_formats(formats) - mediainfo = self._download_json( - 'http://plr.video.mediaset.it/html/metainfo.sjson', - video_id, 'Downloading video info 
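
setdefault keeps any headers a format already carries while stamping the source URL as Referer on every HTML5 format. A runnable illustration (data shapes hypothetical):

    base_url = 'https://example.com/watch/42'
    media_info = {'formats': [
        {'url': 'https://cdn.example.com/v.mp4'},
        {'url': 'https://cdn.example.com/v.m3u8',
         'http_headers': {'Origin': 'https://example.com'}},
    ]}
    for f in media_info['formats']:
        # Existing headers survive; only Referer is added/overwritten.
        f.setdefault('http_headers', {})['Referer'] = base_url
    print(media_info['formats'][1]['http_headers'])
    # {'Origin': 'https://example.com', 'Referer': 'https://example.com/watch/42'}
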
JSON', query={ - 'id': video_id, - })['video'] - - title = mediainfo['title'] + title = media_info['title'] creator = try_get( - mediainfo, lambda x: x['brand-info']['publisher'], compat_str) + media_info, lambda x: x['brand-info']['publisher'], compat_str) category = try_get( - mediainfo, lambda x: x['brand-info']['category'], compat_str) + media_info, lambda x: x['brand-info']['category'], compat_str) categories = [category] if category else None return { 'id': video_id, 'title': title, - 'description': mediainfo.get('short-description'), - 'thumbnail': mediainfo.get('thumbnail'), - 'duration': parse_duration(mediainfo.get('duration')), + 'description': media_info.get('short-description'), + 'thumbnail': media_info.get('thumbnail'), + 'duration': parse_duration(media_info.get('duration')), 'creator': creator, - 'upload_date': unified_strdate(mediainfo.get('production-date')), - 'webpage_url': mediainfo.get('url'), - 'series': mediainfo.get('brand-value'), + 'upload_date': unified_strdate(media_info.get('production-date')), + 'webpage_url': media_info.get('url'), + 'series': media_info.get('brand-value'), 'categories': categories, 'formats': formats, } From 267d81962a0709f15f82f96b7aadbb5473a06992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 Jun 2018 02:16:44 +0700 Subject: [PATCH 039/111] [mediaset] Fix issues and extract all formats (closes #16568) --- youtube_dl/extractor/mediaset.py | 44 +++++++++++++++++--------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 76a2ae125..9f2b60dcc 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -10,7 +10,6 @@ from ..utils import ( parse_duration, try_get, unified_strdate, - ExtractorError ) @@ -58,7 +57,7 @@ class MediasetIE(InfoExtractor): 'series': 'Matrix', 'categories': ['infotainment'], }, - 'expected_warnings': ['is not a supported codec'], + 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { # clip 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', @@ -87,13 +86,14 @@ class MediasetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - media_info = self._download_json( + video = self._download_json( 'https://www.video.mediaset.it/html/metainfo.sjson', video_id, 'Downloading media info', query={ 'id': video_id })['video'] - media_id = try_get(media_info, lambda x: x['guid']) or video_id + title = video['title'] + media_id = video.get('guid') or video_id video_list = self._download_json( 'http://cdnsel01.mediaset.net/GetCdn2018.aspx', @@ -104,12 +104,17 @@ class MediasetIE(InfoExtractor): formats = [] for format_url in video_list: - if '.ism' in format_url: - try: - formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) - except ExtractorError: - pass + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'ism' or '.ism' in format_url: + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) else: formats.append({ 'url': format_url, @@ -117,24 +122,23 @@ class MediasetIE(InfoExtractor): }) self._sort_formats(formats) - title = 
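
The new CDN handling dispatches on the URL's extension instead of assuming ISM. A reduced, runnable model (determine_ext shrunk to a suffix check; URLs hypothetical):

    from os.path import splitext

    def determine_ext(url):
        return splitext(url.split('?')[0])[1][1:]

    def classify(format_url):
        ext = determine_ext(format_url)
        if ext == 'm3u8':
            return 'hls'
        elif ext == 'mpd':
            return 'dash'
        elif ext == 'ism' or '.ism' in format_url:
            return 'mss'
        return 'http'

    for u in ('https://example.com/a.m3u8', 'https://example.com/b.mpd',
              'https://example.com/c.ism/manifest', 'https://example.com/d.mp4'):
        print(u, '->', classify(u))
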
media_info['title'] - creator = try_get( - media_info, lambda x: x['brand-info']['publisher'], compat_str) + video, lambda x: x['brand-info']['publisher'], compat_str) category = try_get( - media_info, lambda x: x['brand-info']['category'], compat_str) + video, lambda x: x['brand-info']['category'], compat_str) categories = [category] if category else None return { 'id': video_id, 'title': title, - 'description': media_info.get('short-description'), - 'thumbnail': media_info.get('thumbnail'), - 'duration': parse_duration(media_info.get('duration')), + 'description': video.get('short-description'), + 'thumbnail': video.get('thumbnail'), + 'duration': parse_duration(video.get('duration')), 'creator': creator, - 'upload_date': unified_strdate(media_info.get('production-date')), - 'webpage_url': media_info.get('url'), - 'series': media_info.get('brand-value'), + 'upload_date': unified_strdate(video.get('production-date')), + 'webpage_url': video.get('url'), + 'series': video.get('brand-value'), + 'season': video.get('season'), 'categories': categories, 'formats': formats, } From 2160768a215849e82a167912cb8f0aa054e87d8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 Jun 2018 23:39:56 +0700 Subject: [PATCH 040/111] [npo] Fix typo (closes #16872) --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index cb8319f0d..c2cb85a73 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -282,7 +282,7 @@ class NPOIE(NPOBaseIE): video_url = stream_info.get('url') if not video_url or video_url in urls: continue - urls.add(item_url) + urls.add(video_url) if determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, ext='mp4', From eca1f0d115e6a2712ff0d5f6b25e3ded5e52db71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Jul 2018 02:00:16 +0700 Subject: [PATCH 041/111] [extractor/common] Properly escape % in MPD templates (closes #16867) --- youtube_dl/extractor/common.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f3fec160d..78f053f18 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2106,7 +2106,21 @@ class InfoExtractor(object): representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) def prepare_template(template_name, identifiers): - t = representation_ms_info[template_name] + tmpl = representation_ms_info[template_name] + # First of, % characters outside $...$ templates + # must be escaped by doubling for proper processing + # by % operator string formatting used further (see + # https://github.com/rg3/youtube-dl/issues/16867). + t = '' + in_template = False + for c in tmpl: + t += c + if c == '$': + in_template = not in_template + elif c == '%' and not in_template: + t += c + # Next, $...$ templates are translated to their + # %(...) 
counterparts to be used with % operator t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) From 973b6ceebbf0c79086cbf3203a8a8c79daf0b1ba Mon Sep 17 00:00:00 2001 From: coreynicholson <coreynicholson@users.noreply.github.com> Date: Sun, 1 Jul 2018 15:19:17 +0100 Subject: [PATCH 042/111] [vlive] Fix live streams extraction --- youtube_dl/extractor/vlive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 64d0224e6..0b5165fd0 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -57,7 +57,7 @@ class VLiveIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s' % video_id, video_id) + 'https://www.vlive.tv/video/%s' % video_id, video_id) VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' VIDEO_PARAMS_FIELD = 'video params' @@ -108,11 +108,11 @@ class VLiveIE(InfoExtractor): def _live(self, video_id, webpage): init_page = self._download_webpage( - 'http://www.vlive.tv/video/init/view', + 'https://www.vlive.tv/video/init/view', video_id, note='Downloading live webpage', data=urlencode_postdata({'videoSeq': video_id}), headers={ - 'Referer': 'http://www.vlive.tv/video/%s' % video_id, + 'Referer': 'https://www.vlive.tv/video/%s' % video_id, 'Content-Type': 'application/x-www-form-urlencoded' }) From 8cee692b8b66322e4c1a0d37baceb9e4c49a3f8e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 1 Jul 2018 22:32:59 +0100 Subject: [PATCH 043/111] [go90] detect geo restriction error and pass geo verification headers(closes #16874) --- youtube_dl/extractor/go90.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 35dde42d0..6f8c56a93 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -28,14 +29,27 @@ class Go90IE(InfoExtractor): 'age_limit': 14, } } + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'https://www.go90.com/api/view/items/' + video_id, - video_id, headers={ + + try: + headers = self.geo_verification_headers() + headers.update({ 'Content-Type': 'application/json; charset=utf-8', - }, data=b'{"client":"web","device_type":"pc"}') + }) + video_data = self._download_json( + 'https://www.go90.com/api/view/items/' + video_id, video_id, + headers=headers, data=b'{"client":"web","device_type":"pc"}') + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + message = self._parse_json(e.cause.read().decode(), None)['error']['message'] + if 'region unavailable' in message: + self.raise_geo_restricted(countries=['US']) + raise ExtractorError(message, expected=True) + raise + if video_data.get('requires_drm'): raise ExtractorError('This video is DRM protected.', expected=True) main_video_asset = video_data['main_video_asset'] From db5debf313bd2ab99016f2c5b389dbf9ffae3dfb Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 1 Jul 2018 22:41:11 +0100 Subject: [PATCH 044/111] [go90] add support for embed urls(closes 
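
A worked example of the template preparation above: literal % characters are doubled first, then the $...$ identifiers become %-style slots, so '$Number%05d$' turns into '%(Number)05d' while a literal '100%' survives the later % operator as '100%%'. The identifier list is trimmed for this sketch:

    import re

    def prepare_template(tmpl, identifiers=('Number', 'Bandwidth', 'Time')):
        out = ''
        in_template = False
        for c in tmpl:
            out += c
            if c == '$':
                in_template = not in_template
            elif c == '%' and not in_template:
                out += c  # double % outside $...$
        out = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', out)
        out = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', out)
        return out

    t = prepare_template('chunk-100%-$Number%05d$.m4s')
    print(t)                   # chunk-100%%-%(Number)05d.m4s
    print(t % {'Number': 42})  # chunk-100%-00042.m4s
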
#16873) --- youtube_dl/extractor/go90.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 6f8c56a93..c3ea717bc 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -15,8 +15,8 @@ from ..utils import ( class Go90IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?go90\.com/videos/(?P<id>[0-9a-zA-Z]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?go90\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)' + _TESTS = [{ 'url': 'https://www.go90.com/videos/84BUqjLpf9D', 'md5': 'efa7670dbbbf21a7b07b360652b24a32', 'info_dict': { @@ -28,7 +28,10 @@ class Go90IE(InfoExtractor): 'upload_date': '20170411', 'age_limit': 14, } - } + }, { + 'url': 'https://www.go90.com/embed/261MflWkD3N', + 'only_matching': True, + }] _GEO_BYPASS = False def _real_extract(self, url): From 5621c3222eaab29d5ca705c8dac2d0bc1eb785d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Jul 2018 02:47:09 +0700 Subject: [PATCH 045/111] [lynda] Simplify login and improve error capturing (#16891) --- youtube_dl/extractor/lynda.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index f5c7abc13..1316cddb6 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -44,21 +44,15 @@ class LyndaBaseIE(InfoExtractor): form_data = self._hidden_inputs(form_html) form_data.update(extra_form_data) - try: - response = self._download_json( - action_url, None, note, - data=urlencode_postdata(form_data), - headers={ - 'Referer': referrer_url, - 'X-Requested-With': 'XMLHttpRequest', - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: - response = self._parse_json(e.cause.read().decode('utf-8'), None) - self._check_error(response, ('email', 'password')) - raise + response = self._download_json( + action_url, None, note, + data=urlencode_postdata(form_data), + headers={ + 'Referer': referrer_url, + 'X-Requested-With': 'XMLHttpRequest', + }, expected_status=(418, 500, )) - self._check_error(response, 'ErrorMessage') + self._check_error(response, ('email', 'password', 'ErrorMessage')) return response, action_url From 836ef4840f00d7e07c826de23f3a09675dc532d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Jul 2018 04:48:40 +0700 Subject: [PATCH 046/111] [pluralsight] Switch to graphql (closes #16889, closes #16899) --- youtube_dl/extractor/pluralsight.py | 120 +++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index a207ca9cb..1257841e4 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -27,6 +27,60 @@ from ..utils import ( class PluralsightBaseIE(InfoExtractor): _API_BASE = 'https://app.pluralsight.com' + _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE + _GRAPHQL_HEADERS = { + 'Content-Type': 'application/json;charset=UTF-8', + } + _GRAPHQL_COURSE_TMPL = ''' +query BootstrapPlayer { + rpc { + bootstrapPlayer { + profile { + firstName + lastName + email + username + userHandle + authed + isAuthed + plan + } + course(courseId: "%s") { + name + title + courseHasCaptions + translationLanguages { + code + name + } + supportsWideScreenVideoFormats + timestamp + modules { + name + title + duration + formattedDuration + 
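
The go90 handler turns the service's HTTP 400 payload into a geo-restriction signal before giving up. A condensed model (payload shape as in the patch; the exception plumbing is simplified to plain Python exceptions):

    import json

    class GeoRestricted(Exception):
        pass

    def check_api_error(status_code, body):
        if status_code != 400:
            return
        message = json.loads(body)['error']['message']
        if 'region unavailable' in message:
            raise GeoRestricted('US only')
        raise RuntimeError(message)

    try:
        check_api_error(400, '{"error": {"message": "region unavailable"}}')
    except GeoRestricted as e:
        print('geo blocked:', e)  # geo blocked: US only
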
author + authorized + clips { + authorized + clipId + duration + formattedDuration + id + index + moduleIndex + moduleTitle + name + title + watched + } + } + } + } + } +}''' + def _download_course(self, course_id, url, display_id): try: return self._download_course_rpc(course_id, url, display_id) @@ -39,20 +93,14 @@ class PluralsightBaseIE(InfoExtractor): def _download_course_rpc(self, course_id, url, display_id): response = self._download_json( - '%s/player/functions/rpc' % self._API_BASE, display_id, - 'Downloading course JSON', - data=json.dumps({ - 'fn': 'bootstrapPlayer', - 'payload': { - 'courseId': course_id, - }, - }).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }) + self._GRAPHQL_EP, display_id, data=json.dumps({ + 'query': self._GRAPHQL_COURSE_TMPL % course_id, + 'variables': {} + }).encode('utf-8'), headers=self._GRAPHQL_HEADERS) - course = try_get(response, lambda x: x['payload']['course'], dict) + course = try_get( + response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'], + dict) if course: return course @@ -90,6 +138,28 @@ class PluralsightIE(PluralsightBaseIE): 'only_matching': True, }] + GRAPHQL_VIEWCLIP_TMPL = ''' +query viewClip { + viewClip(input: { + author: "%(author)s", + clipIndex: %(clipIndex)d, + courseName: "%(courseName)s", + includeCaptions: %(includeCaptions)s, + locale: "%(locale)s", + mediaType: "%(mediaType)s", + moduleName: "%(moduleName)s", + quality: "%(quality)s" + }) { + urls { + url + cdn + rank + source + }, + status + } +}''' + def _real_initialize(self): self._login() @@ -277,7 +347,7 @@ class PluralsightIE(PluralsightBaseIE): f = QUALITIES[quality].copy() clip_post = { 'author': author, - 'includeCaptions': False, + 'includeCaptions': 'false', 'clipIndex': int(clip_idx), 'courseName': course_name, 'locale': 'en', @@ -286,11 +356,23 @@ class PluralsightIE(PluralsightBaseIE): 'quality': '%dx%d' % (f['width'], f['height']), } format_id = '%s-%s' % (ext, quality) - viewclip = self._download_json( - '%s/video/clips/viewclip' % self._API_BASE, display_id, - 'Downloading %s viewclip JSON' % format_id, fatal=False, - data=json.dumps(clip_post).encode('utf-8'), - headers={'Content-Type': 'application/json;charset=utf-8'}) + + try: + viewclip = self._download_json( + self._GRAPHQL_EP, display_id, + 'Downloading %s viewclip graphql' % format_id, + data=json.dumps({ + 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, + 'variables': {} + }).encode('utf-8'), + headers=self._GRAPHQL_HEADERS)['data']['viewClip'] + except ExtractorError: + # Still works but most likely will go soon + viewclip = self._download_json( + '%s/video/clips/viewclip' % self._API_BASE, display_id, + 'Downloading %s viewclip JSON' % format_id, fatal=False, + data=json.dumps(clip_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) # Pluralsight tracks multiple sequential calls to ViewClip API and start # to return 429 HTTP errors after some time (see From 24d26ab3808619b209e6a22c505a46eaa781c614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Jul 2018 04:49:03 +0700 Subject: [PATCH 047/111] [lynda] PEP 8 --- youtube_dl/extractor/lynda.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 1316cddb6..4ba61cd8a 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..compat import ( - 
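
Both Pluralsight calls now share one request shape: a JSON document holding the query text plus empty variables, POSTed to the player GraphQL endpoint with a JSON content type. A sketch of the viewclip payload assembly only (field values illustrative; note includeCaptions is interpolated as the literal string 'false', matching the patch):

    import json

    GRAPHQL_EP = 'https://app.pluralsight.com/player/api/graphql'
    VIEWCLIP_TMPL = '''
    query viewClip {
      viewClip(input: {
        author: "%(author)s",
        clipIndex: %(clipIndex)d,
        courseName: "%(courseName)s",
        includeCaptions: %(includeCaptions)s,
        locale: "%(locale)s",
        mediaType: "%(mediaType)s",
        moduleName: "%(moduleName)s",
        quality: "%(quality)s"
      }) { urls { url cdn rank source }, status }
    }'''

    clip_post = {
        'author': 'some-author', 'clipIndex': 0,
        'courseName': 'some-course', 'includeCaptions': 'false',
        'locale': 'en', 'mediaType': 'mp4',
        'moduleName': 'introduction', 'quality': '1280x720',
    }
    payload = json.dumps({'query': VIEWCLIP_TMPL % clip_post, 'variables': {}})
    print(payload[:60])
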
compat_HTTPError,
     compat_str,
     compat_urlparse,
 )

From d5de0f21b97bb26c8c558e33b40202a6e02f7b05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Wed, 4 Jul 2018 04:57:17 +0700
Subject: [PATCH 048/111] [ChangeLog] Actualize [ci skip]

---
 ChangeLog | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 8eb7469d4..b1045d7d4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,24 @@
+version <unreleased>
+
+Core
+* [extractor/common] Properly escape % in MPD templates (#16867)
+* [extractor/common] Use source URL as Referer for HTML5 entries (#16849)
+* Prefer ffmpeg over avconv by default (#8622)
+
+Extractors
+* [pluralsight] Switch to graphql (#16889, #16895, #16896, #16899)
+* [lynda] Simplify login and improve error capturing (#16891)
++ [go90] Add support for embed URLs (#16873)
+* [go90] Detect geo restriction error and pass geo verification headers
+  (#16874)
+* [vlive] Fix live streams extraction (#16871)
+* [npo] Fix typo (#16872)
++ [mediaset] Add support for new videos and extract all formats (#16568)
+* [dctptv] Restore extraction based on REST API (#16850)
+* [svt] Improve extraction and add support for pages (#16802)
+* [porncom] Fix extraction (#16808)
+
+
 version 2018.06.25
 
 Extractors

From 689af4960e16c63c1f26933095d2f8c8b76f119f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Wed, 4 Jul 2018 04:59:21 +0700
Subject: [PATCH 049/111] release 2018.07.04

---
 .github/ISSUE_TEMPLATE.md | 6 +++---
 ChangeLog                 | 2 +-
 README.md                 | 4 ++--
 docs/supportedsites.md    | 1 +
 youtube_dl/version.py     | 2 +-
 5 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index 128e6e681..453983f84 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -6,8 +6,8 @@
 
 ---
 
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.25*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.25**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.04*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.04** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.25 +[debug] youtube-dl version 2018.07.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index b1045d7d4..c33bf7777 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.07.04 Core * [extractor/common] Properly escape % in MPD templates (#16867) diff --git a/README.md b/README.md index 499a0c206..09e62899a 100644 --- a/README.md +++ b/README.md @@ -427,9 +427,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo default; fix file if we can, warn otherwise) --prefer-avconv Prefer avconv over ffmpeg for running the - postprocessors (default) - --prefer-ffmpeg Prefer ffmpeg over avconv for running the postprocessors + --prefer-ffmpeg Prefer ffmpeg over avconv for running the + postprocessors (default) --ffmpeg-location PATH Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory. diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a78fabb02..19dc984dc 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -813,6 +813,7 @@ - **StretchInternet** - **SunPorno** - **SVT** + - **SVTPage** - **SVTPlay**: SVT Play and Öppet arkiv - **SVTSeries** - **SWRMediathek** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8fbafd6a1..4cf97291b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.25' +__version__ = '2018.07.04' From 9a6628aaf918afdcdf4c661f474e318207155780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Jul 2018 00:36:35 +0700 Subject: [PATCH 050/111] [youtube] Improve login error handling (closes #13822) --- youtube_dl/extractor/youtube.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 89c8b7f8d..117a57911 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -178,13 +178,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): warn('Unable to extract result entry') return False - tfa = try_get(res, lambda x: x[0][0], list) - if tfa: - tfa_str = try_get(tfa, lambda x: x[2], compat_str) - if tfa_str == 'TWO_STEP_VERIFICATION': + login_challenge = try_get(res, lambda x: x[0][0], list) + if login_challenge: + challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) + if challenge_str == 'TWO_STEP_VERIFICATION': # SEND_SUCCESS - TFA code has been successfully sent to phone # QUOTA_EXCEEDED - reached the limit of TFA codes - status = try_get(tfa, lambda x: x[5], compat_str) + status = try_get(login_challenge, lambda x: x[5], compat_str) if status 
== 'QUOTA_EXCEEDED': warn('Exceeded the limit of TFA codes, try later') return False @@ -228,6 +228,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor): check_cookie_url = try_get( tfa_results, lambda x: x[0][-1][2], compat_str) + else: + CHALLENGES = { + 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", + 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', + 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", + } + challenge = CHALLENGES.get( + challenge_str, + '%s returned error %s.' % (self.IE_NAME, challenge_str)) + warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) + return False else: check_cookie_url = try_get(res, lambda x: x[2], compat_str) From 94fef94d9c8571db255b0f3694617f5cd56825b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Jul 2018 02:14:06 +0700 Subject: [PATCH 051/111] [dplayit] Fix extraction (closes #16901) --- youtube_dl/extractor/dplay.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index fe47f6dce..a95e3213c 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -21,6 +21,7 @@ from ..utils import ( unified_strdate, unified_timestamp, update_url_query, + urljoin, USER_AGENTS, ) @@ -310,9 +311,11 @@ class DPlayItIE(InfoExtractor): if not info: info_url = self._search_regex( - r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', - webpage, 'info url') + (r'playback_json_url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + r'url\s*[:=]\s*["\'](?P<url>(?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)'), + webpage, 'info url', group='url') + info_url = urljoin(url, info_url) video_id = info_url.rpartition('/')[-1] try: @@ -322,6 +325,8 @@ class DPlayItIE(InfoExtractor): 'dplayit_token').value, 'Referer': url, }) + if isinstance(info, compat_str): + info = self._parse_json(info, display_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): info = self._parse_json(e.cause.read().decode('utf-8'), display_id) From e15141adaee67d1e9b9e97c17b73fe4c052c3449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Jul 2018 02:14:50 +0700 Subject: [PATCH 052/111] [dplayit] Sort formats --- youtube_dl/extractor/dplay.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index a95e3213c..ebf59512c 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -342,6 +342,7 @@ class DPlayItIE(InfoExtractor): formats = self._extract_m3u8_formats( hls_url, display_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) series = self._html_search_regex( r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>', From 1ed0b2f74d8337b9625716f9069233a341edd22e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Jul 2018 02:22:15 +0700 Subject: [PATCH 053/111] [watchbox] Fix extraction (closes #16904) --- youtube_dl/extractor/watchbox.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py index be0bcba15..d99313080 100644 --- a/youtube_dl/extractor/watchbox.py 
+++ b/youtube_dl/extractor/watchbox.py
@@ -67,11 +67,12 @@ class WatchBoxIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        source = self._parse_json(
+        source = (self._parse_json(
             self._search_regex(
-                r'(?s)source["\']?\s*:\s*({.+?})\s*[,}]', webpage, 'source',
+                r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config',
                 default='{}'),
-            video_id, transform_source=js_to_json, fatal=False) or {}
+            video_id, transform_source=js_to_json,
+            fatal=False) or {}).get('source') or {}
 
         video_id = compat_str(source.get('videoId') or video_id)
 

From 4e71dfd819ccef91a056edc5bf6ca8cec9f2ad4f Mon Sep 17 00:00:00 2001
From: Aaron Brager <getaaron@gmail.com>
Date: Thu, 5 Jul 2018 10:17:18 -0500
Subject: [PATCH 054/111] [README.md] Rename OS X to macOS

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 09e62899a..6d49d6a4f 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ youtube-dl - download videos from youtube.com or other video platforms
 
 # INSTALLATION
 
-To install it right away for all UNIX users (Linux, OS X, etc.), type:
+To install it right away for all UNIX users (Linux, macOS, etc.), type:
 
     sudo curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl
    sudo chmod a+rx /usr/local/bin/youtube-dl
@@ -35,7 +35,7 @@ You can also use pip:
 
 This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information.
 
-OS X users can install youtube-dl with [Homebrew](https://brew.sh/):
+macOS users can install youtube-dl with [Homebrew](https://brew.sh/):
 
     brew install youtube-dl
 
@@ -442,7 +442,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
 
 # CONFIGURATION
 
-You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself.
+You can configure youtube-dl by placing any supported command line option in a configuration file. On Linux and macOS, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that by default the configuration file may not exist so you may need to create it yourself.
 
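A configuration file is plain text with one option per line, and lines starting with `#` are comments. As a minimal sketch (the options shown are illustrative, any supported command line option works the same way):

```
# Always extract audio
-x

# Save all videos under Movies directory in your home directory
-o ~/Movies/%(title)s.%(ext)s
```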
For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory: ``` From 47421507883850dc679dc23eb44a615f18282bed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Jul 2018 23:49:36 +0700 Subject: [PATCH 055/111] [funk] Fix extraction (closes #16918) --- youtube_dl/extractor/funk.py | 64 ++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/funk.py b/youtube_dl/extractor/funk.py index 0ff058619..76c20ffac 100644 --- a/youtube_dl/extractor/funk.py +++ b/youtube_dl/extractor/funk.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from .nexx import NexxIE +from ..compat import compat_str from ..utils import ( int_or_none, try_get, @@ -12,6 +13,19 @@ from ..utils import ( class FunkBaseIE(InfoExtractor): + _HEADERS = { + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8', + 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4', + } + _AUTH = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4' + + @staticmethod + def _make_headers(referer): + headers = FunkBaseIE._HEADERS.copy() + headers['Referer'] = referer + return headers + def _make_url_result(self, video): return { '_type': 'url_transparent', @@ -48,19 +62,19 @@ class FunkMixIE(FunkBaseIE): lists = self._download_json( 'https://www.funk.net/api/v3.1/curation/curatedLists/', - mix_id, headers={ - 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbC12Mi4wIiwic2NvcGUiOiJzdGF0aWMtY29udGVudC1hcGksY3VyYXRpb24tc2VydmljZSxzZWFyY2gtYXBpIn0.SGCC1IXHLtZYoo8PvRKlU2gXH1su8YSu47sB3S4iXBI', - 'Referer': url, - }, query={ + mix_id, headers=self._make_headers(url), query={ 'size': 100, - })['result']['lists'] + })['_embedded']['curatedListList'] metas = next( l for l in lists if mix_id in (l.get('entityId'), l.get('alias')))['videoMetas'] video = next( meta['videoDataDelegate'] - for meta in metas if meta.get('alias') == alias) + for meta in metas + if try_get( + meta, lambda x: x['videoDataDelegate']['alias'], + compat_str) == alias) return self._make_url_result(video) @@ -104,25 +118,39 @@ class FunkChannelIE(FunkBaseIE): channel_id = mobj.group('id') alias = mobj.group('alias') - headers = { - 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc', - 'Referer': url, - } + headers = self._make_headers(url) video = None - by_id_list = self._download_json( - 'https://www.funk.net/api/v3.0/content/videos/byIdList', channel_id, - headers=headers, query={ - 'ids': alias, + # Id-based channels are currently broken on their side: webplayer + # tries to process them via byChannelAlias endpoint and fails + # predictably. 
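+        # The lookup order below is therefore: byChannelAlias first (matching
+        # the clip by its alias), then the older byIdList endpoint, and
+        # finally the generic filter endpoint as a last resort.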
+ by_channel_alias = self._download_json( + 'https://www.funk.net/api/v3.1/webapp/videos/byChannelAlias/%s' + % channel_id, + 'Downloading byChannelAlias JSON', headers=headers, query={ + 'size': 100, }, fatal=False) - if by_id_list: - video = try_get(by_id_list, lambda x: x['result'][0], dict) + if by_channel_alias: + video_list = try_get( + by_channel_alias, lambda x: x['_embedded']['videoList'], list) + if video_list: + video = next(r for r in video_list if r.get('alias') == alias) + + if not video: + by_id_list = self._download_json( + 'https://www.funk.net/api/v3.0/content/videos/byIdList', + channel_id, 'Downloading byIdList JSON', headers=headers, + query={ + 'ids': alias, + }, fatal=False) + if by_id_list: + video = try_get(by_id_list, lambda x: x['result'][0], dict) if not video: results = self._download_json( - 'https://www.funk.net/api/v3.0/content/videos/filter', channel_id, - headers=headers, query={ + 'https://www.funk.net/api/v3.0/content/videos/filter', + channel_id, 'Downloading filter JSON', headers=headers, query={ 'channelId': channel_id, 'size': 100, })['result'] From 6868d272e5c4a85ba4143bacbb7269dac099c55d Mon Sep 17 00:00:00 2001 From: Luca Cherubin <luca.cherubin@gmail.com> Date: Thu, 26 Apr 2018 20:33:09 +0100 Subject: [PATCH 056/111] [frontendmasters] Add extractor --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/frontendmaster.py | 271 +++++++++++++++++++++++++ 2 files changed, 275 insertions(+) create mode 100644 youtube_dl/extractor/frontendmaster.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f2377521b..265b4aa9d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -390,6 +390,10 @@ from .francetv import ( from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE +from .frontendmaster import ( + FrontEndMasterIE, + FrontEndMasterCourseIE +) from .funimation import FunimationIE from .funk import ( FunkMixIE, diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py new file mode 100644 index 000000000..21e382da9 --- /dev/null +++ b/youtube_dl/extractor/frontendmaster.py @@ -0,0 +1,271 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import collections +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse) +from ..utils import ( + ExtractorError, + urlencode_postdata, + qualities, unescapeHTML) + + +class FrontEndMasterBaseIE(InfoExtractor): + _API_BASE = 'https://api.frontendmasters.com/v1/kabuki/courses' + _VIDEO_BASE = 'http://www.frontendmasters.com/courses' + _CAPTIONS_BASE = 'https://api.frontendmasters.com/v1/kabuki/transcripts' + _COOKIES_BASE = 'https://api.frontendmasters.com' + _LOGIN_URL = 'https://frontendmasters.com/login/' + + _QUALITIES_PREFERENCE = ('low', 'medium', 'high') + _QUALITIES = { + 'low': {'width': 480, 'height': 360}, + 'medium': {'width': 1280, 'height': 720}, + 'high': {'width': 1920, 'height': 1080} + } + + AllowedQuality = collections.namedtuple('AllowedQuality', + ['ext', 'qualities']) + _ALLOWED_QUALITIES = [ + AllowedQuality('webm', ['low', 'medium', 'high']), + AllowedQuality('mp4', ['low', 'medium', 'high']) + ] + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = 
self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post_url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'} + ) + + error = self._search_regex( + r'<div[^>]+class=["\']Message MessageAlert["\'][^>]*>' + r'([^<]+)' + r'</div>', + response, 'error message', default=None) + + if error: + raise ExtractorError('Unable to login: %s' % unescapeHTML(error), + expected=True) + + def _download_course(self, course_id, url): + response = self._download_json( + '%s/%s' % (self._API_BASE, course_id), course_id, + 'Downloading course JSON', + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }) + return response + + @staticmethod + def _pair_section_video_element(lesson_elements): + sections = {} + current_section = None + current_section_number = 0 + for elem in lesson_elements: + if not isinstance(elem, int): + elem_name = elem + if not isinstance(elem_name, str): + # convert unicode to str + elem_name = elem.encode('utf-8') + (current_section, current_section_number) = \ + (elem_name, current_section_number + 1) + else: + if current_section: + sections[elem] = (current_section, current_section_number) + + return sections + + +class FrontEndMasterIE(FrontEndMasterBaseIE): + IE_NAME = 'frontend-masters' + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/' \ + r'(?P<courseid>[a-z\-]+)/' \ + r'(?P<id>[a-z\-]+)' + + _NETRC_MACHINE = 'frontendmasters' + + _TEST = { + 'url': 'https://frontendmasters.com/courses/web-development/tools', + 'md5': '7f161159710d6b7016a4f4af6fcb05e2', + 'info_dict': { + 'id': 'tools', + 'title': 'Tools', + 'display_id': 'tools', + 'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7', + 'ext': 'mp4' + }, + 'skip': 'Requires FrontendMasters account credentials', + } + + def _get_subtitles(self, video_hash, video_id): + captions = self._download_webpage( + '%s/%s.vtt' % (self._CAPTIONS_BASE, video_hash), video_id, + fatal=False) + if captions: + return { + 'en': [{ + 'ext': 'vtt', + 'data': captions + }] + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + course_id = mobj.group('courseid') + + course_json_content = self._download_course(course_id=course_id, + url=url) + + # Necessary to get mandatory informations like title and video_url + lesson_index = course_json_content.get('lessonSlugs').index(video_id) + lesson_hash = course_json_content.get('lessonHashes')[lesson_index] + lesson_data = course_json_content.get('lessonData')[lesson_hash] + # This is necessary to get the link for the video + lesson_source_base = lesson_data['sourceBase'] + + lesson_title = lesson_data['title'] + + # Some optional fields + lesson_description = lesson_data.get('description') + lesson_index = lesson_data.get('index') + lesson_slug = lesson_data.get('slug') + lesson_thumbnail_url = lesson_data.get('thumbnail') + lesson_section_elements = course_json_content.get('lessonElements') + + try: + course_sections_pairing = self._pair_section_video_element( + lesson_section_elements) + + lesson_section = \ + course_sections_pairing.get(lesson_index)[0] + + lesson_section_number = \ + 
course_sections_pairing.get(lesson_index)[1] + except Exception: + lesson_section = None + lesson_section_number = None + + video_request_url = '%s/source' + video_request_headers = { + 'origin': 'https://frontendmasters.com', + 'referer': lesson_source_base, + } + + quality_key = qualities(self._QUALITIES_PREFERENCE) + + formats = [] + for ext, qualities_ in self._ALLOWED_QUALITIES: + for quality in qualities_: + f = self._QUALITIES[quality].copy() + video_request_params = { + 'r': f['height'], + 'f': ext + } + video_response = self._download_json( + video_request_url % lesson_source_base, video_id, + query=video_request_params, headers=video_request_headers) + + video_url = video_response.get('url') + clip_f = f.copy() + clip_f.update({ + 'url': video_url, + 'ext': ext, + 'format_id': '%s-%s' % (ext, quality), + 'quality': quality_key(quality), + 'height': f['height'] + }) + formats.append(clip_f) + + self._sort_formats(formats) + + subtitles = self.extract_subtitles(lesson_hash, video_id) + + return { + 'id': video_id, + 'display_id': lesson_slug, + 'title': lesson_title, + 'description': lesson_description, + 'chapter': lesson_section, + 'chapter_number': lesson_section_number, + 'thumbnail': lesson_thumbnail_url, + 'formats': formats, + 'subtitles': subtitles + } + + +class FrontEndMasterCourseIE(FrontEndMasterBaseIE): + IE_NAME = 'frontend-masters:course' + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<courseid>[a-z\-]+)/?$' + + _NETRC_MACHINE = 'frontendmasters' + + _TEST = { + 'url': 'https://frontendmasters.com/courses/javascript-basics/', + 'info_dict': { + 'id': 'javascript-basics', + 'title': 'Introduction to JavaScript Programming', + 'description': 'md5:269412fbb76d86954761599ad8e4cbc9' + }, + 'playlist_count': 19, + 'skip': 'Requires FrontendMasters account credentials' + } + + @classmethod + def suitable(cls, url): + return False if FrontEndMasterIE.suitable(url) else super(FrontEndMasterBaseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('courseid') + course_json_content = self._download_course(course_id=course_id, + url=url) + + title = course_json_content.get('title') + description = course_json_content.get('description') + course_display_id = course_json_content.get('slug') + + videos_data = course_json_content.get('lessonData').values() + videos_data = sorted(videos_data, key=lambda video: video.get('index')) + + entries = [] + for video in videos_data: + video_slug = video.get('slug') + clip_url = '%s/%s/%s' % ( + self._VIDEO_BASE, course_display_id, video_slug) + entries.append({ + '_type': 'url_transparent', + 'url': clip_url, + 'ie_key': FrontEndMasterIE.ie_key() + }) + + return self.playlist_result(entries, course_id, title, description) From 69fcdb845b9744125161f514cb4166becbae2959 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Jul 2018 00:48:23 +0700 Subject: [PATCH 057/111] [frontendmasters] Fix issues and improve extraction (closes #3661, closes #16328) --- youtube_dl/extractor/extractors.py | 7 +- youtube_dl/extractor/frontendmaster.py | 271 ------------------------ youtube_dl/extractor/frontendmasters.py | 262 +++++++++++++++++++++++ 3 files changed, 266 insertions(+), 274 deletions(-) delete mode 100644 youtube_dl/extractor/frontendmaster.py create mode 100644 youtube_dl/extractor/frontendmasters.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 265b4aa9d..ed532d77f 100644 
--- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -390,9 +390,10 @@ from .francetv import ( from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE -from .frontendmaster import ( - FrontEndMasterIE, - FrontEndMasterCourseIE +from .frontendmasters import ( + FrontendMastersIE, + FrontendMastersLessonIE, + FrontendMastersCourseIE ) from .funimation import FunimationIE from .funk import ( diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py deleted file mode 100644 index 21e382da9..000000000 --- a/youtube_dl/extractor/frontendmaster.py +++ /dev/null @@ -1,271 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import collections -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urlparse) -from ..utils import ( - ExtractorError, - urlencode_postdata, - qualities, unescapeHTML) - - -class FrontEndMasterBaseIE(InfoExtractor): - _API_BASE = 'https://api.frontendmasters.com/v1/kabuki/courses' - _VIDEO_BASE = 'http://www.frontendmasters.com/courses' - _CAPTIONS_BASE = 'https://api.frontendmasters.com/v1/kabuki/transcripts' - _COOKIES_BASE = 'https://api.frontendmasters.com' - _LOGIN_URL = 'https://frontendmasters.com/login/' - - _QUALITIES_PREFERENCE = ('low', 'medium', 'high') - _QUALITIES = { - 'low': {'width': 480, 'height': 360}, - 'medium': {'width': 1280, 'height': 720}, - 'high': {'width': 1920, 'height': 1080} - } - - AllowedQuality = collections.namedtuple('AllowedQuality', - ['ext', 'qualities']) - _ALLOWED_QUALITIES = [ - AllowedQuality('webm', ['low', 'medium', 'high']), - AllowedQuality('mp4', ['low', 'medium', 'high']) - ] - - def _real_initialize(self): - self._login() - - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'username': username, - 'password': password - }) - - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, - 'post_url', default=self._LOGIN_URL, group='url') - - if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - response = self._download_webpage( - post_url, None, 'Logging in', - data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'} - ) - - error = self._search_regex( - r'<div[^>]+class=["\']Message MessageAlert["\'][^>]*>' - r'([^<]+)' - r'</div>', - response, 'error message', default=None) - - if error: - raise ExtractorError('Unable to login: %s' % unescapeHTML(error), - expected=True) - - def _download_course(self, course_id, url): - response = self._download_json( - '%s/%s' % (self._API_BASE, course_id), course_id, - 'Downloading course JSON', - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }) - return response - - @staticmethod - def _pair_section_video_element(lesson_elements): - sections = {} - current_section = None - current_section_number = 0 - for elem in lesson_elements: - if not isinstance(elem, int): - elem_name = elem - if not isinstance(elem_name, str): - # convert unicode to str - elem_name = elem.encode('utf-8') - (current_section, current_section_number) = \ - (elem_name, current_section_number + 1) - else: - if current_section: - sections[elem] = (current_section, current_section_number) - - 
return sections - - -class FrontEndMasterIE(FrontEndMasterBaseIE): - IE_NAME = 'frontend-masters' - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/' \ - r'(?P<courseid>[a-z\-]+)/' \ - r'(?P<id>[a-z\-]+)' - - _NETRC_MACHINE = 'frontendmasters' - - _TEST = { - 'url': 'https://frontendmasters.com/courses/web-development/tools', - 'md5': '7f161159710d6b7016a4f4af6fcb05e2', - 'info_dict': { - 'id': 'tools', - 'title': 'Tools', - 'display_id': 'tools', - 'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7', - 'ext': 'mp4' - }, - 'skip': 'Requires FrontendMasters account credentials', - } - - def _get_subtitles(self, video_hash, video_id): - captions = self._download_webpage( - '%s/%s.vtt' % (self._CAPTIONS_BASE, video_hash), video_id, - fatal=False) - if captions: - return { - 'en': [{ - 'ext': 'vtt', - 'data': captions - }] - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - course_id = mobj.group('courseid') - - course_json_content = self._download_course(course_id=course_id, - url=url) - - # Necessary to get mandatory informations like title and video_url - lesson_index = course_json_content.get('lessonSlugs').index(video_id) - lesson_hash = course_json_content.get('lessonHashes')[lesson_index] - lesson_data = course_json_content.get('lessonData')[lesson_hash] - # This is necessary to get the link for the video - lesson_source_base = lesson_data['sourceBase'] - - lesson_title = lesson_data['title'] - - # Some optional fields - lesson_description = lesson_data.get('description') - lesson_index = lesson_data.get('index') - lesson_slug = lesson_data.get('slug') - lesson_thumbnail_url = lesson_data.get('thumbnail') - lesson_section_elements = course_json_content.get('lessonElements') - - try: - course_sections_pairing = self._pair_section_video_element( - lesson_section_elements) - - lesson_section = \ - course_sections_pairing.get(lesson_index)[0] - - lesson_section_number = \ - course_sections_pairing.get(lesson_index)[1] - except Exception: - lesson_section = None - lesson_section_number = None - - video_request_url = '%s/source' - video_request_headers = { - 'origin': 'https://frontendmasters.com', - 'referer': lesson_source_base, - } - - quality_key = qualities(self._QUALITIES_PREFERENCE) - - formats = [] - for ext, qualities_ in self._ALLOWED_QUALITIES: - for quality in qualities_: - f = self._QUALITIES[quality].copy() - video_request_params = { - 'r': f['height'], - 'f': ext - } - video_response = self._download_json( - video_request_url % lesson_source_base, video_id, - query=video_request_params, headers=video_request_headers) - - video_url = video_response.get('url') - clip_f = f.copy() - clip_f.update({ - 'url': video_url, - 'ext': ext, - 'format_id': '%s-%s' % (ext, quality), - 'quality': quality_key(quality), - 'height': f['height'] - }) - formats.append(clip_f) - - self._sort_formats(formats) - - subtitles = self.extract_subtitles(lesson_hash, video_id) - - return { - 'id': video_id, - 'display_id': lesson_slug, - 'title': lesson_title, - 'description': lesson_description, - 'chapter': lesson_section, - 'chapter_number': lesson_section_number, - 'thumbnail': lesson_thumbnail_url, - 'formats': formats, - 'subtitles': subtitles - } - - -class FrontEndMasterCourseIE(FrontEndMasterBaseIE): - IE_NAME = 'frontend-masters:course' - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<courseid>[a-z\-]+)/?$' - - _NETRC_MACHINE = 'frontendmasters' - - _TEST = { - 'url': 
'https://frontendmasters.com/courses/javascript-basics/', - 'info_dict': { - 'id': 'javascript-basics', - 'title': 'Introduction to JavaScript Programming', - 'description': 'md5:269412fbb76d86954761599ad8e4cbc9' - }, - 'playlist_count': 19, - 'skip': 'Requires FrontendMasters account credentials' - } - - @classmethod - def suitable(cls, url): - return False if FrontEndMasterIE.suitable(url) else super(FrontEndMasterBaseIE, cls).suitable(url) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - course_id = mobj.group('courseid') - course_json_content = self._download_course(course_id=course_id, - url=url) - - title = course_json_content.get('title') - description = course_json_content.get('description') - course_display_id = course_json_content.get('slug') - - videos_data = course_json_content.get('lessonData').values() - videos_data = sorted(videos_data, key=lambda video: video.get('index')) - - entries = [] - for video in videos_data: - video_slug = video.get('slug') - clip_url = '%s/%s/%s' % ( - self._VIDEO_BASE, course_display_id, video_slug) - entries.append({ - '_type': 'url_transparent', - 'url': clip_url, - 'ie_key': FrontEndMasterIE.ie_key() - }) - - return self.playlist_result(entries, course_id, title, description) diff --git a/youtube_dl/extractor/frontendmasters.py b/youtube_dl/extractor/frontendmasters.py new file mode 100644 index 000000000..770db46d0 --- /dev/null +++ b/youtube_dl/extractor/frontendmasters.py @@ -0,0 +1,262 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + parse_duration, + urlencode_postdata, +) + + +class FrontendMastersBaseIE(InfoExtractor): + _API_BASE = 'https://api.frontendmasters.com/v1/kabuki' + _LOGIN_URL = 'https://frontendmasters.com/login/' + + _NETRC_MACHINE = 'frontendmasters' + + _QUALITIES = { + 'low': {'width': 480, 'height': 360}, + 'mid': {'width': 1280, 'height': 720}, + 'high': {'width': 1920, 'height': 1080} + } + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post_url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + # Successful login + if any(p in response for p in ( + 'wp-login.php?action=logout', '>Logout')): + return + + error = self._html_search_regex( + r'class=(["\'])(?:(?!\1).)*\bMessageAlert\b(?:(?!\1).)*\1[^>]*>(?P<error>[^<]+)<', + response, 'error message', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class FrontendMastersPageBaseIE(FrontendMastersBaseIE): + def _download_course(self, course_name, url): + return self._download_json( + '%s/courses/%s' % (self._API_BASE, course_name), course_name, + 'Downloading course JSON', headers={'Referer': url}) + + 
@staticmethod + def _extract_chapters(course): + chapters = [] + lesson_elements = course.get('lessonElements') + if isinstance(lesson_elements, list): + chapters = [e for e in lesson_elements if isinstance(e, compat_str)] + return chapters + + @staticmethod + def _extract_lesson(chapters, lesson_id, lesson): + title = lesson.get('title') or lesson_id + display_id = lesson.get('slug') + description = lesson.get('description') + thumbnail = lesson.get('thumbnail') + + chapter_number = None + index = lesson.get('index') + element_index = lesson.get('elementIndex') + if (isinstance(index, int) and isinstance(element_index, int) and + index < element_index): + chapter_number = element_index - index + chapter = (chapters[chapter_number - 1] + if chapter_number - 1 < len(chapters) else None) + + duration = None + timestamp = lesson.get('timestamp') + if isinstance(timestamp, compat_str): + mobj = re.search( + r'(?P<start>\d{1,2}:\d{1,2}:\d{1,2})\s*-(?P<end>\s*\d{1,2}:\d{1,2}:\d{1,2})', + timestamp) + if mobj: + duration = parse_duration(mobj.group('end')) - parse_duration( + mobj.group('start')) + + return { + '_type': 'url_transparent', + 'url': 'frontendmasters:%s' % lesson_id, + 'ie_key': FrontendMastersIE.ie_key(), + 'id': lesson_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'chapter': chapter, + 'chapter_number': chapter_number, + } + + +class FrontendMastersIE(FrontendMastersBaseIE): + _VALID_URL = r'(?:frontendmasters:|https?://api\.frontendmasters\.com/v\d+/kabuki/video/)(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://api.frontendmasters.com/v1/kabuki/video/a2qogef6ba', + 'md5': '7f161159710d6b7016a4f4af6fcb05e2', + 'info_dict': { + 'id': 'a2qogef6ba', + 'ext': 'mp4', + 'title': 'a2qogef6ba', + }, + 'skip': 'Requires FrontendMasters account credentials', + }, { + 'url': 'frontendmasters:a2qogef6ba', + 'only_matching': True, + }] + + def _real_extract(self, url): + lesson_id = self._match_id(url) + + source_url = '%s/video/%s/source' % (self._API_BASE, lesson_id) + + formats = [] + for ext in ('webm', 'mp4'): + for quality in ('low', 'mid', 'high'): + resolution = self._QUALITIES[quality].copy() + format_id = '%s-%s' % (ext, quality) + format_url = self._download_json( + source_url, lesson_id, + 'Downloading %s source JSON' % format_id, query={ + 'f': ext, + 'r': resolution['height'], + }, headers={ + 'Referer': url, + }, fatal=False)['url'] + + if not format_url: + continue + + f = resolution.copy() + f.update({ + 'url': format_url, + 'ext': ext, + 'format_id': format_id, + }) + formats.append(f) + self._sort_formats(formats) + + subtitles = { + 'en': [{ + 'url': '%s/transcripts/%s.vtt' % (self._API_BASE, lesson_id), + }] + } + + return { + 'id': lesson_id, + 'title': lesson_id, + 'formats': formats, + 'subtitles': subtitles + } + + +class FrontendMastersLessonIE(FrontendMastersPageBaseIE): + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<course_name>[^/]+)/(?P<lesson_name>[^/]+)' + _TEST = { + 'url': 'https://frontendmasters.com/courses/web-development/tools', + 'info_dict': { + 'id': 'a2qogef6ba', + 'display_id': 'tools', + 'ext': 'mp4', + 'title': 'Tools', + 'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7', + 'thumbnail': r're:^https?://.*\.jpg$', + 'chapter': 'Introduction', + 'chapter_number': 1, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires FrontendMasters account credentials', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, 
url) + course_name, lesson_name = mobj.group('course_name', 'lesson_name') + + course = self._download_course(course_name, url) + + lesson_id, lesson = next( + (video_id, data) + for video_id, data in course['lessonData'].items() + if data.get('slug') == lesson_name) + + chapters = self._extract_chapters(course) + return self._extract_lesson(chapters, lesson_id, lesson) + + +class FrontendMastersCourseIE(FrontendMastersPageBaseIE): + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<id>[^/]+)' + _TEST = { + 'url': 'https://frontendmasters.com/courses/web-development/', + 'info_dict': { + 'id': 'web-development', + 'title': 'Introduction to Web Development', + 'description': 'md5:9317e6e842098bf725d62360e52d49a6', + }, + 'playlist_count': 81, + 'skip': 'Requires FrontendMasters account credentials', + } + + @classmethod + def suitable(cls, url): + return False if FrontendMastersLessonIE.suitable(url) else super( + FrontendMastersBaseIE, cls).suitable(url) + + def _real_extract(self, url): + course_name = self._match_id(url) + + course = self._download_course(course_name, url) + + chapters = self._extract_chapters(course) + + lessons = sorted( + course['lessonData'].values(), key=lambda data: data['index']) + + entries = [] + for lesson in lessons: + lesson_name = lesson.get('slug') + if not lesson_name: + continue + lesson_id = lesson.get('hash') or lesson.get('statsId') + entries.append(self._extract_lesson(chapters, lesson_id, lesson)) + + title = course.get('title') + description = course.get('description') + + return self.playlist_result(entries, course_name, title, description) From e06632e3fe25036b804a62469bb18fa4c37e3368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Jul 2018 08:22:56 +0700 Subject: [PATCH 058/111] [downloader/dash] Improve error handling (#16927) --- youtube_dl/downloader/dash.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 576ece6db..eaa7adf7c 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .fragment import FragmentFD from ..compat import compat_urllib_error -from ..utils import urljoin +from ..utils import ( + DownloadError, + urljoin, +) class DashSegmentsFD(FragmentFD): @@ -57,6 +60,14 @@ class DashSegmentsFD(FragmentFD): count += 1 if count <= fragment_retries: self.report_retry_fragment(err, frag_index, count, fragment_retries) + except DownloadError: + # Don't retry fragment if error occurred during HTTP downloading + # itself since it has own retry settings + if not fatal: + self.report_skip_fragment(frag_index) + break + raise + if count > fragment_retries: if not fatal: self.report_skip_fragment(frag_index) From 0685d9727b9657fc8a31c96cb52c4155de29fcfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Jul 2018 23:43:05 +0700 Subject: [PATCH 059/111] [utils] Share JSON-LD regex --- youtube_dl/extractor/common.py | 4 ++-- youtube_dl/utils.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 78f053f18..5d4db54d5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -52,6 +52,7 @@ from ..utils import ( GeoUtils, int_or_none, js_to_json, + JSON_LD_RE, mimetype2ext, orderedSet, parse_codecs, @@ -1149,8 +1150,7 @@ class InfoExtractor(object): def 
_search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( - r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', - html, 'JSON-LD', group='json_ld', **kwargs) + JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs) default = kwargs.get('default', NO_DEFAULT) if not json_ld: return default if default is not NO_DEFAULT else {} diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 6a3199fb9..8c45166d7 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -184,6 +184,7 @@ DATE_FORMATS_MONTH_FIRST.extend([ ]) PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" +JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>' def preferredencoding(): From 79fd7320e24596b39d81c2a364fb5b41c2f57b41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Jul 2018 23:44:05 +0700 Subject: [PATCH 060/111] [nrktv] Add support for new episode URL schema (closes #16909) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nrk.py | 38 +++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ed532d77f..a20712d34 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -768,6 +768,7 @@ from .nrk import ( NRKSkoleIE, NRKTVIE, NRKTVDirekteIE, + NRKTVEpisodeIE, NRKTVEpisodesIE, NRKTVSeriesIE, ) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 7157e2390..50dd07d11 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -8,6 +8,7 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, int_or_none, + JSON_LD_RE, parse_age_limit, parse_duration, ) @@ -359,6 +360,40 @@ class NRKTVIE(NRKBaseIE): }] +class NRKTVEpisodeIE(InfoExtractor): + _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)' + _TEST = { + 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', + 'info_dict': { + 'id': 'MSUI14000816AA', + 'ext': 'mp4', + 'title': 'Backstage 8:30', + 'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4', + 'duration': 1320, + 'series': 'Backstage', + 'season_number': 1, + 'episode_number': 8, + 'episode': '8:30', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + nrk_id = self._parse_json( + self._search_regex(JSON_LD_RE, webpage, 'JSON-LD', group='json_ld'), + display_id)['@id'] + + assert re.match(NRKTVIE._EPISODE_RE, nrk_id) + return self.url_result( + 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id) + + class NRKTVDirekteIE(NRKTVIE): IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)' @@ -470,7 +505,8 @@ class NRKTVSeriesIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if NRKTVIE.suitable(url) else super(NRKTVSeriesIE, cls).suitable(url) + return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) + else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): series_id = self._match_id(url) From 4b3ee09886d1f2a096004013e6a8a13a1f564ba8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 10 Jul 2018 00:21:14 +0700 Subject: [PATCH 061/111] [nrktv] Add support for new season and 
serie URL schema --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nrk.py | 208 ++++++++++++++++++++--------- 2 files changed, 149 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a20712d34..c6f8a785a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -770,6 +770,7 @@ from .nrk import ( NRKTVDirekteIE, NRKTVEpisodeIE, NRKTVEpisodesIE, + NRKTVSeasonIE, NRKTVSeriesIE, ) from .ntvde import NTVDeIE diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 50dd07d11..a231735fb 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,13 +4,18 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( ExtractorError, int_or_none, JSON_LD_RE, + NO_DEFAULT, parse_age_limit, parse_duration, + try_get, ) @@ -394,6 +399,148 @@ class NRKTVEpisodeIE(InfoExtractor): 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id) +class NRKTVSerieBaseIE(InfoExtractor): + def _extract_series(self, webpage, display_id, fatal=True): + config = self._parse_json( + self._search_regex( + r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', webpage, 'config', + default='{}' if not fatal else NO_DEFAULT), + display_id, fatal=False) + if not config: + return + return try_get(config, lambda x: x['series'], dict) + + def _extract_episodes(self, season): + entries = [] + if not isinstance(season, dict): + return entries + episodes = season.get('episodes') + if not isinstance(episodes, list): + return entries + for episode in episodes: + nrk_id = episode.get('prfId') + if not nrk_id or not isinstance(nrk_id, compat_str): + continue + entries.append(self.url_result( + 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) + return entries + + +class NRKTVSeasonIE(NRKTVSerieBaseIE): + _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P<id>\d+)' + _TEST = { + 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', + 'info_dict': { + 'id': '1', + 'title': 'Sesong 1', + }, + 'playlist_mincount': 30, + } + + @classmethod + def suitable(cls, url): + return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) + else super(NRKTVSeasonIE, cls).suitable(url)) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + series = self._extract_series(webpage, display_id) + + season = next( + s for s in series['seasons'] + if int(display_id) == s.get('seasonNumber')) + + title = try_get(season, lambda x: x['titles']['title'], compat_str) + return self.playlist_result( + self._extract_episodes(season), display_id, title) + + +class NRKTVSeriesIE(NRKTVSerieBaseIE): + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' + _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' + _TESTS = [{ + # new layout + 'url': 'https://tv.nrk.no/serie/backstage', + 'info_dict': { + 'id': 'backstage', + 'title': 'Backstage', + 'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3', + }, + 'playlist_mincount': 60, + }, { + # old layout + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'info_dict': { + 'id': 'groenn-glede', + 'title': 'Grønn glede', + 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', + }, + 'playlist_mincount': 9, + }, { + 'url': 'http://tv.nrksuper.no/serie/labyrint', + 'info_dict': { + 'id': 
'labyrint', + 'title': 'Labyrint', + 'description': 'md5:58afd450974c89e27d5a19212eee7115', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/saving-the-human-race', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/postmann-pat', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return ( + False if any(ie.suitable(url) + for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE)) + else super(NRKTVSeriesIE, cls).suitable(url)) + + def _real_extract(self, url): + series_id = self._match_id(url) + + webpage = self._download_webpage(url, series_id) + + # New layout (e.g. https://tv.nrk.no/serie/backstage) + series = self._extract_series(webpage, series_id, fatal=False) + if series: + title = try_get(series, lambda x: x['titles']['title'], compat_str) + description = try_get( + series, lambda x: x['titles']['subtitle'], compat_str) + entries = [] + for season in series['seasons']: + entries.extend(self._extract_episodes(season)) + return self.playlist_result(entries, series_id, title, description) + + # Old layout (e.g. https://tv.nrk.no/serie/groenn-glede) + entries = [ + self.url_result( + 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( + series=series_id, season=season_id)) + for season_id in re.findall(self._ITEM_RE, webpage) + ] + + title = self._html_search_meta( + 'seriestitle', webpage, + 'title', default=None) or self._og_search_title( + webpage, fatal=False) + + description = self._html_search_meta( + 'series_description', webpage, + 'description', default=None) or self._og_search_description(webpage) + + return self.playlist_result(entries, series_id, title, description) + + class NRKTVDirekteIE(NRKTVIE): IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)' @@ -473,65 +620,6 @@ class NRKTVEpisodesIE(NRKPlaylistBaseIE): r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False) -class NRKTVSeriesIE(InfoExtractor): - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' - _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://tv.nrk.no/serie/groenn-glede', - 'info_dict': { - 'id': 'groenn-glede', - 'title': 'Grønn glede', - 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', - }, - 'playlist_mincount': 9, - }, { - 'url': 'http://tv.nrksuper.no/serie/labyrint', - 'info_dict': { - 'id': 'labyrint', - 'title': 'Labyrint', - 'description': 'md5:58afd450974c89e27d5a19212eee7115', - }, - 'playlist_mincount': 3, - }, { - 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/saving-the-human-race', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/postmann-pat', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) - else super(NRKTVSeriesIE, cls).suitable(url)) - - def _real_extract(self, url): - series_id = self._match_id(url) - - webpage = self._download_webpage(url, series_id) - - entries = [ - self.url_result( - 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( - series=series_id, season=season_id)) - for season_id in re.findall(self._ITEM_RE, webpage) - ] - - title = self._html_search_meta( - 'seriestitle', webpage, - 'title', default=None) or self._og_search_title( - webpage, fatal=False) - - 
description = self._html_search_meta( - 'series_description', webpage, - 'description', default=None) or self._og_search_description(webpage) - - return self.playlist_result(entries, series_id, title, description) - - class NRKSkoleIE(InfoExtractor): IE_DESC = 'NRK Skole' _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)' From 7e8e948cf7eb17de57e95f20d1b3cb963f46f121 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 10 Jul 2018 02:08:15 +0700 Subject: [PATCH 062/111] [ChangeLog] Actualize [ci skip] --- ChangeLog | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index c33bf7777..978a316bc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +version <unreleased> + +Core +* [utils] Share JSON-LD regular expression +* [downloader/dash] Improve error handling (#16927) + +Extractors ++ [nrktv] Add support for new season and serie URL schema ++ [nrktv] Add support for new episode URL schema (#16909) ++ [frontendmasters] Add support for frontendmasters.com (#3661, #16328) +* [funk] Fix extraction (#16918) +* [watchbox] Fix extraction (#16904) +* [dplayit] Sort formats +* [dplayit] Fix extraction (#16901) +* [youtube] Improve login error handling (#13822) + + version 2018.07.04 Core From 40a051fa9f48000f311f243c40e3cae588420738 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 10 Jul 2018 02:09:51 +0700 Subject: [PATCH 063/111] release 2018.07.10 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 5 +++++ youtube_dl/version.py | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 453983f84..f192c6633 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.04*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.04** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.10** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.07.04 +[debug] youtube-dl version 2018.07.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 978a316bc..1d602079e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.07.10 Core * [utils] Share JSON-LD regular expression diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 19dc984dc..6cbe81802 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -302,6 +302,9 @@ - **Freesound** - **freespeech.org** - **FreshLive** + - **FrontendMasters** + - **FrontendMastersCourse** + - **FrontendMastersLesson** - **Funimation** - **FunkChannel** - **FunkMix** @@ -589,7 +592,9 @@ - **NRKSkole**: NRK Skole - **NRKTV**: NRK TV and NRK Radio - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte + - **NRKTVEpisode** - **NRKTVEpisodes** + - **NRKTVSeason** - **NRKTVSeries** - **ntv.ru** - **Nuvid** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4cf97291b..c7083cf47 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.07.04' +__version__ = '2018.07.10' From 79367a98208fbf01d6e04b6747a6e01d0b1f8b9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 Jul 2018 18:05:06 +0700 Subject: [PATCH 064/111] [pornhub] Improve extraction and extract all formats (closes #12166, closes #15891, closes #16262, closes #16959) --- youtube_dl/extractor/pornhub.py | 125 ++++++++++++++++++++------------ 1 file changed, 79 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 23e24d216..97f988da4 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -4,28 +4,21 @@ from __future__ import unicode_literals import functools import itertools import operator -# import os import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - # compat_urllib_parse_unquote, - # compat_urllib_parse_unquote_plus, - # compat_urllib_parse_urlparse, + compat_str, ) from ..utils import ( ExtractorError, int_or_none, js_to_json, orderedSet, - # sanitized_Request, remove_quotes, str_to_int, ) -# from ..aes import ( -# aes_decrypt_text -# ) class PornHubIE(InfoExtractor): @@ -62,7 +55,7 @@ class PornHubIE(InfoExtractor): 'id': '1331683002', 'ext': 'mp4', 'title': '重庆婷婷女王足交', - 'uploader': 'cj397186295', + 'uploader': 'Unknown', 'duration': 1753, 'view_count': int, 'like_count': int, @@ -121,7 +114,7 @@ class PornHubIE(InfoExtractor): self._set_cookie('pornhub.com', 'platform', platform) return self._download_webpage( 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id, - video_id) + video_id, 
'Downloading %s webpage' % platform) webpage = dl_webpage('pc') @@ -134,48 +127,19 @@ class PornHubIE(InfoExtractor): 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) - tv_webpage = dl_webpage('tv') - - assignments = self._search_regex( - r'(var.+?mediastring.+?)</script>', tv_webpage, - 'encoded url').split(';') - - js_vars = {} - - def parse_js_value(inp): - inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) - if '+' in inp: - inps = inp.split('+') - return functools.reduce( - operator.concat, map(parse_js_value, inps)) - inp = inp.strip() - if inp in js_vars: - return js_vars[inp] - return remove_quotes(inp) - - for assn in assignments: - assn = assn.strip() - if not assn: - continue - assn = re.sub(r'var\s+', '', assn) - vname, value = assn.split('=', 1) - js_vars[vname] = parse_js_value(value) - - video_url = js_vars['mediastring'] - - title = self._search_regex( - r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None) - # video_title from flashvars contains whitespace instead of non-ASCII (see # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. - title = title or self._html_search_meta( + title = self._html_search_meta( 'twitter:title', webpage, default=None) or self._search_regex( (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)', r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), webpage, 'title', group='title') + video_urls = [] + video_urls_set = set() + flashvars = self._parse_json( self._search_regex( r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), @@ -183,8 +147,78 @@ class PornHubIE(InfoExtractor): if flashvars: thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) + media_definitions = flashvars.get('mediaDefinitions') + if isinstance(media_definitions, list): + for definition in media_definitions: + if not isinstance(definition, dict): + continue + video_url = definition.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue + if video_url in video_urls_set: + continue + video_urls_set.add(video_url) + video_urls.append( + (video_url, int_or_none(definition.get('quality')))) else: - title, thumbnail, duration = [None] * 3 + thumbnail, duration = [None] * 2 + + if not video_urls: + tv_webpage = dl_webpage('tv') + + assignments = self._search_regex( + r'(var.+?mediastring.+?)</script>', tv_webpage, + 'encoded url').split(';') + + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return functools.reduce( + operator.concat, map(parse_js_value, inps)) + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + return remove_quotes(inp) + + for assn in assignments: + assn = assn.strip() + if not assn: + continue + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) + js_vars[vname] = parse_js_value(value) + + video_url = js_vars['mediastring'] + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + for mobj in re.finditer( + r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage): + video_url = mobj.group('url') + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + formats = [] + for video_url, height in video_urls: + tbr = None + mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url) + if mobj: + if not 
height: + height = int(mobj.group('height')) + tbr = int(mobj.group('tbr')) + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + self._sort_formats(formats) video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', @@ -210,7 +244,6 @@ class PornHubIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, 'uploader': video_uploader, 'title': title, 'thumbnail': thumbnail, @@ -219,7 +252,7 @@ class PornHubIE(InfoExtractor): 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, - # 'formats': formats, + 'formats': formats, 'age_limit': 18, 'tags': tags, 'categories': categories, From 905eef2b06f1e890b1dfd228aa4fa1fa2308d687 Mon Sep 17 00:00:00 2001 From: Jakub Wilk <jwilk@jwilk.net> Date: Wed, 18 Jul 2018 18:47:26 +0200 Subject: [PATCH 065/111] [imgur] Allow digits in filename extension --- youtube_dl/extractor/imgur.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 2901960a5..ecc958a17 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -12,7 +12,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z0-9]+)?$' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -43,6 +43,9 @@ class ImgurIE(InfoExtractor): }, { 'url': 'http://imgur.com/r/aww/VQcQPhM', 'only_matching': True, + }, { + 'url': 'https://i.imgur.com/crGpqCV.mp4', + 'only_matching': True, }] def _real_extract(self, url): From bd21ead2a20ff16ec8cb10da72526103471069d6 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 18 Jul 2018 18:29:18 +0100 Subject: [PATCH 066/111] [extractor/common] add support for DASH and MSS formats extraction in SMIL manifests --- youtube_dl/extractor/common.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5d4db54d5..b8bbaf81a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1859,9 +1859,7 @@ class InfoExtractor(object): 'height': height, }) formats.extend(m3u8_formats) - continue - - if src_ext == 'f4m': + elif src_ext == 'f4m': f4m_url = src_url if not f4m_params: f4m_params = { @@ -1871,9 +1869,13 @@ class InfoExtractor(object): f4m_url += '&' if '?' in f4m_url else '?' 
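                 # append the url-encoded f4m parameters before requesting the HDS manifest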
f4m_url += compat_urllib_parse_urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) - continue - - if src_url.startswith('http') and self._is_valid_url(src, video_id): + elif src_ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src_url, video_id, mpd_id='dash', fatal=False)) + elif re.search(r'\.ism/[Mm]anifest', src_url): + formats.extend(self._extract_ism_formats( + src_url, video_id, ism_id='mss', fatal=False)) + elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url, @@ -1884,7 +1886,6 @@ class InfoExtractor(object): 'width': width, 'height': height, }) - continue return formats From 371dcc1dd4b29001910005c1d3e416db204cc262 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 18 Jul 2018 18:31:40 +0100 Subject: [PATCH 067/111] [theplatform] add support for theplatform Top-level domain customization(#16977) --- youtube_dl/extractor/theplatform.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index b1a985ff6..e7dc6071c 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -32,13 +32,14 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): + _TP_TLD = 'com' def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml( smil_url, video_id, note=note, query={'format': 'SMIL'}, headers=self.geo_verification_headers()) error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') if error_element is not None and error_element.attrib['src'].startswith( - 'http://link.theplatform.com/s/errorFiles/Unavailable.'): + 'http://link.theplatform.%s/s/errorFiles/Unavailable.' 
% self._TP_TLD): raise ExtractorError(error_element.attrib['abstract'], expected=True) smil_formats = self._parse_smil_formats( @@ -66,7 +67,7 @@ class ThePlatformBaseIE(OnceIE): return formats, subtitles def _download_theplatform_metadata(self, path, video_id): - info_url = 'http://link.theplatform.com/s/%s?format=preview' % path + info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path) return self._download_json(info_url, video_id) def _parse_theplatform_metadata(self, info): From 38f1eb0ac3be5d3b61e7722db0024e61cf98eb69 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 18 Jul 2018 18:33:33 +0100 Subject: [PATCH 068/111] [mediaset] fix extraction(closes #16977) --- youtube_dl/extractor/mediaset.py | 155 +++++++++++++++---------------- 1 file changed, 74 insertions(+), 81 deletions(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 9f2b60dcc..57f97409d 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -3,75 +3,75 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor -from ..compat import compat_str +from .theplatform import ThePlatformBaseIE from ..utils import ( - determine_ext, - parse_duration, - try_get, - unified_strdate, + ExtractorError, + int_or_none, + update_url_query, ) -class MediasetIE(InfoExtractor): +class MediasetIE(ThePlatformBaseIE): + _TP_TLD = 'eu' _VALID_URL = r'''(?x) (?: mediaset:| https?:// - (?:www\.)?video\.mediaset\.it/ + (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ (?: (?:video|on-demand)/(?:[^/]+/)+[^/]+_| - player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid= + player/index\.html\?.*?\bprogramGuid= ) - )(?P<id>[0-9]+) + )(?P<id>[0-9A-Z]{16}) ''' _TESTS = [{ # full episode - 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html', + 'url': 'https://www.mediasetplay.mediaset.it/video/hellogoodbye/quarta-puntata_FAFU000000661824', 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d', 'info_dict': { - 'id': '661824', + 'id': 'FAFU000000661824', 'ext': 'mp4', 'title': 'Quarta puntata', - 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1414, - 'creator': 'mediaset', + 'duration': 1414.26, 'upload_date': '20161107', 'series': 'Hello Goodbye', - 'categories': ['reality'], + 'timestamp': 1478532900, + 'uploader': 'Rete 4', + 'uploader_id': 'R4', }, - 'expected_warnings': ['is not a supported codec'], }, { - 'url': 'http://www.video.mediaset.it/video/matrix/full_chiambretti/puntata-del-25-maggio_846685.html', - 'md5': '1276f966ac423d16ba255ce867de073e', + 'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501', + 'md5': '288532f0ad18307705b01e581304cd7b', 'info_dict': { - 'id': '846685', + 'id': 'F309013801000501', 'ext': 'mp4', 'title': 'Puntata del 25 maggio', - 'description': 'md5:ee2e456e3eb1dba5e814596655bb5296', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 6565, - 'creator': 'mediaset', - 'upload_date': '20180525', + 'duration': 6565.007, + 'upload_date': '20180526', 'series': 'Matrix', - 'categories': ['infotainment'], + 'timestamp': 1527326245, + 'uploader': 'Canale 5', + 'uploader_id': 'C5', }, 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { # clip - 'url': 
'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', + 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680', 'only_matching': True, }, { # iframe simple - 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true', + 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924', 'only_matching': True, }, { # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) - 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true', + 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104', 'only_matching': True, }, { - 'url': 'mediaset:661824', + 'url': 'mediaset:FAFU000000665924', 'only_matching': True, }] @@ -84,61 +84,54 @@ class MediasetIE(InfoExtractor): webpage)] def _real_extract(self, url): - video_id = self._match_id(url) - - video = self._download_json( - 'https://www.video.mediaset.it/html/metainfo.sjson', - video_id, 'Downloading media info', query={ - 'id': video_id - })['video'] - - title = video['title'] - media_id = video.get('guid') or video_id - - video_list = self._download_json( - 'http://cdnsel01.mediaset.net/GetCdn2018.aspx', - video_id, 'Downloading video CDN JSON', query={ - 'streamid': media_id, - 'format': 'json', - })['videoList'] + guid = self._match_id(url) + tp_path = 'PR1GhC/media/guid/2702976343/' + guid + info = self._extract_theplatform_metadata(tp_path, guid) formats = [] - for format_url in video_list: - ext = determine_ext(format_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - elif ext == 'ism' or '.ism' in format_url: - formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': determine_ext(format_url), - }) + subtitles = {} + first_e = None + for asset_type in ('SD', 'HD'): + for f in ('MPEG4', 'MPEG-DASH', 'M3U', 'ISM'): + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), { + 'mbr': 'true', + 'formats': f, + 'assetTypes': asset_type, + }), guid, 'Downloading %s %s SMIL data' % (f, asset_type)) + except ExtractorError as e: + if not first_e: + first_e = e + break + for tp_f in tp_formats: + tp_f['quality'] = 1 if asset_type == 'HD' else 0 + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if first_e and not formats: + raise first_e self._sort_formats(formats) - creator = try_get( - video, lambda x: x['brand-info']['publisher'], compat_str) - category = try_get( - video, lambda x: x['brand-info']['category'], compat_str) - categories = [category] if category else None + fields = [] + for templ, repls in (('tvSeason%sNumber', ('', 'Episode')), ('mediasetprogram$%s', ('brandTitle', 'numberOfViews', 'publishInfo'))): + fields.extend(templ % repl for repl in repls) + feed_data = self._download_json( + 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs/guid/-/' + guid, + guid, 
fatal=False, query={'fields': ','.join(fields)}) + if feed_data: + publish_info = feed_data.get('mediasetprogram$publishInfo') or {} + info.update({ + 'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')), + 'season_number': int_or_none(feed_data.get('tvSeasonNumber')), + 'series': feed_data.get('mediasetprogram$brandTitle'), + 'uploader': publish_info.get('description'), + 'uploader_id': publish_info.get('channel'), + 'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')), + }) - return { - 'id': video_id, - 'title': title, - 'description': video.get('short-description'), - 'thumbnail': video.get('thumbnail'), - 'duration': parse_duration(video.get('duration')), - 'creator': creator, - 'upload_date': unified_strdate(video.get('production-date')), - 'webpage_url': video.get('url'), - 'series': video.get('brand-value'), - 'season': video.get('season'), - 'categories': categories, + info.update({ + 'id': guid, 'formats': formats, - } + 'subtitles': subtitles, + }) + return info From c63f5fb8633dda1b0a673c90d537e93497a8d62d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Jul 2018 01:59:00 +0700 Subject: [PATCH 069/111] [slutload] Fix and improve extraction (closes #17001) --- youtube_dl/extractor/slutload.py | 57 +++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py index 6fc2ff60d..661f9e59d 100644 --- a/youtube_dl/extractor/slutload.py +++ b/youtube_dl/extractor/slutload.py @@ -1,12 +1,10 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class SlutloadIE(InfoExtractor): - _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/', 'md5': '868309628ba00fd488cf516a113fd717', @@ -16,33 +14,52 @@ class SlutloadIE(InfoExtractor): 'title': 'virginie baisee en cam', 'age_limit': 18, 'thumbnail': r're:https?://.*?\.jpg' - } + }, }, { # mobile site 'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/', 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/', + 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - desktop_url = re.sub(r'^(https?://)mobile\.', r'\1', url) - webpage = self._download_webpage(desktop_url, video_id) + embed_page = self._download_webpage( + 'http://www.slutload.com/embed_player/%s' % video_id, video_id, + 'Downloading embed page', fatal=False) - video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>', - webpage, 'title').strip() + if embed_page: + def extract(what): + return self._html_search_regex( + r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what, + embed_page, 'video %s' % what, default=None, group='url') - video_url = self._html_search_regex( - r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"', - webpage, 'video URL') - thumbnail = self._html_search_regex( - r'(?s)<div id="vidPlayer"\s+.*?previewer-file="([^"]+)"', - webpage, 'thumbnail', fatal=False) + video_url = extract('url') + if video_url: + title = self._html_search_regex( + r'<title>([^<]+)', embed_page, 'title', default=video_id) + return { + 'id': video_id, 
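+                    # direct URL taken from the embed player's data attributes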
+ 'url': video_url, + 'title': title, + 'thumbnail': extract('preview'), + 'age_limit': 18 + } - return { + webpage = self._download_webpage( + 'http://www.slutload.com/video/_/%s/' % video_id, video_id) + title = self._html_search_regex( + r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip() + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info.update({ 'id': video_id, - 'url': video_url, - 'title': video_title, - 'thumbnail': thumbnail, - 'age_limit': 18 - } + 'title': title, + 'age_limit': 18, + }) + return info From 8da17f96803faa35ab19352bdcd2777011d8812a Mon Sep 17 00:00:00 2001 From: bato3 <bato3@bandyci.org> Date: Wed, 18 Jul 2018 21:04:05 +0200 Subject: [PATCH 070/111] [dailymotion] Improve description extraction (closes #16984) --- youtube_dl/extractor/dailymotion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 9a74906cb..8f5f57b98 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -144,7 +144,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): age_limit = self._rta_search(webpage) - description = self._og_search_description(webpage) or self._html_search_meta( + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( 'description', webpage, 'description') view_count_str = self._search_regex( From 11330f5121732f80e3d6ba1c34102955427eb04b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Jul 2018 02:25:19 +0700 Subject: [PATCH 071/111] [facebook] Extract view count and update tests (closes #16942) --- youtube_dl/extractor/facebook.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 8a9ed96c2..f78479b92 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -20,6 +20,7 @@ from ..utils import ( int_or_none, js_to_json, limit_length, + parse_count, sanitized_Request, try_get, urlencode_postdata, @@ -75,7 +76,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Asif Nawab Butt posted a video to his Timeline.', + 'title': 're:^Asif Nawab Butt posted a video', 'uploader': 'Asif Nawab Butt', 'upload_date': '20140506', 'timestamp': 1399398998, @@ -133,7 +134,7 @@ class FacebookIE(InfoExtractor): }, { # have 1080P, but only up to 720p in swf params 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': '0d9813160b146b3bc8744e006027fcc6', + 'md5': '9571fae53d4165bbbadb17a94651dcdc', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', @@ -142,6 +143,7 @@ class FacebookIE(InfoExtractor): 'upload_date': '20161030', 'uploader': 'CNN', 'thumbnail': r're:^https?://.*', + 'view_count': int, }, }, { # bigPipe.onPageletArrive ... 
onPageletArrive pagelet_group_mall @@ -149,7 +151,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1417995061575415', 'ext': 'mp4', - 'title': 'md5:a7b86ca673f51800cd54687b7f4012fe', + 'title': 'md5:1db063d6a8c13faa8da727817339c857', 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', @@ -176,7 +178,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1396382447100162', 'ext': 'mp4', - 'title': 'md5:e2d2700afdf84e121f5d0f999bad13a3', + 'title': 'md5:19a428bbde91364e3de815383b54a235', 'timestamp': 1486035494, 'upload_date': '20170202', 'uploader': 'Elisabeth Ahtn', @@ -426,6 +428,10 @@ class FacebookIE(InfoExtractor): 'timestamp', default=None)) thumbnail = self._og_search_thumbnail(webpage) + view_count = parse_count(self._search_regex( + r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', + default=None)) + info_dict = { 'id': video_id, 'title': video_title, @@ -433,6 +439,7 @@ class FacebookIE(InfoExtractor): 'uploader': uploader, 'timestamp': timestamp, 'thumbnail': thumbnail, + 'view_count': view_count, } return webpage, info_dict From 6fc09f0155bae1fd0d3edf31111b37012875b6f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Jul 2018 23:14:20 +0700 Subject: [PATCH 072/111] [vimeo] Add another config regex (closes #17013) --- youtube_dl/extractor/vimeo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 3baa2d075..e49b233f2 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -539,9 +539,10 @@ class VimeoIE(VimeoBaseInfoExtractor): # We try to find out to which variable is assigned the config dic m_variable_name = re.search(r'(\w)\.video\.id', webpage) if m_variable_name is not None: - config_re = r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1)) + config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))] else: config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] + config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') config = self._search_regex(config_re, webpage, 'info section', flags=re.DOTALL) config = json.loads(config) From c258570eddeafdef23221326e1961c81934f5297 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 20 Jul 2018 00:01:43 +0700 Subject: [PATCH 073/111] [viu] Pass Referer and Origin headers (closes #16992) --- youtube_dl/extractor/viu.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py index 5cf93591c..e268f9409 100644 --- a/youtube_dl/extractor/viu.py +++ b/youtube_dl/extractor/viu.py @@ -214,6 +214,9 @@ class ViuOTTIE(InfoExtractor): 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code, video_id, 'Downloading stream info', query={ 'ccs_product_id': video_data['ccs_product_id'], + }, headers={ + 'Referer': url, + 'Origin': re.search(r'https?://[^/]+', url).group(0), })['data']['stream'] stream_sizes = stream_data.get('size', {}) From ecb6b6ae2df15cb777fbffdfb8058affa8918f54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 20 Jul 2018 00:46:50 +0700 Subject: [PATCH 074/111] [viu] Pass area id --- youtube_dl/extractor/viu.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py index e268f9409..3bd37525b 100644 --- a/youtube_dl/extractor/viu.py +++ 
b/youtube_dl/extractor/viu.py @@ -195,16 +195,29 @@ class ViuOTTIE(InfoExtractor): 'skip': 'Geo-restricted to Hong Kong', }] + _AREA_ID = { + 'HK': 1, + 'SG': 2, + 'TH': 4, + 'PH': 5, + } + def _real_extract(self, url): country_code, video_id = re.match(self._VALID_URL, url).groups() + query = { + 'r': 'vod/ajax-detail', + 'platform_flag_label': 'web', + 'product_id': video_id, + } + + area_id = self._AREA_ID.get(country_code.upper()) + if area_id: + query['area_id'] = area_id + product_data = self._download_json( 'http://www.viu.com/ott/%s/index.php' % country_code, video_id, - 'Downloading video info', query={ - 'r': 'vod/ajax-detail', - 'platform_flag_label': 'web', - 'product_id': video_id, - })['data'] + 'Downloading video info', query=query)['data'] video_data = product_data.get('current_product') if not video_data: From 25586c601c46768271070c61af76e7fa1d196890 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 20 Jul 2018 00:48:50 +0700 Subject: [PATCH 075/111] [theplatform] PEP 8 [ci skip] --- youtube_dl/extractor/theplatform.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index e7dc6071c..411b1f874 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -33,6 +33,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): _TP_TLD = 'com' + def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml( smil_url, video_id, note=note, query={'format': 'SMIL'}, From fd62b36680ff7d7bea789ac0031a33fc2d9270ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 20 Jul 2018 02:39:20 +0700 Subject: [PATCH 076/111] [vrtnu] Relax title extraction and extract JSON-LD (closes #17018) --- youtube_dl/extractor/canvas.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8ac62c1a6..174fd9e2b 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -11,6 +11,7 @@ from ..utils import ( strip_or_none, float_or_none, int_or_none, + merge_dicts, parse_iso8601, ) @@ -248,9 +249,13 @@ class VrtNUIE(GigyaBaseIE): webpage, urlh = self._download_webpage_handle(url, display_id) - title = self._html_search_regex( + info = self._search_json_ld(webpage, display_id, default={}) + + # title is optional here since it may be extracted by extractor + # that is delegated from here + title = strip_or_none(self._html_search_regex( r'(?ms)<h1 class="content__heading">(.+?)</h1>', - webpage, 'title').strip() + webpage, 'title', default=None)) description = self._html_search_regex( r'(?ms)<div class="content__description">(.+?)</div>', @@ -295,7 +300,7 @@ class VrtNUIE(GigyaBaseIE): # the first one video_id = list(video.values())[0].get('videoid') - return { + return merge_dicts(info, { '_type': 'url_transparent', 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, 'ie_key': CanvasIE.ie_key(), @@ -307,4 +312,4 @@ class VrtNUIE(GigyaBaseIE): 'season_number': season_number, 'episode_number': episode_number, 'release_date': release_date, - } + }) From e9c671d5e86e43785382ae9cb20c8e7676c7c9bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Jul 2018 12:30:18 +0700 Subject: [PATCH 077/111] [utils] Allow JSONP with empty func name (closes #17028) --- 
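Note: the functional change is a single character - the callback-name
quantifier in strip_jsonp's regular expression goes from + to * - so
anonymous JSONP payloads of the form ({...}); are now stripped just like
the usual callback({...}); style. A quick sanity check, assuming a
youtube-dl checkout on the Python path:

    from youtube_dl.utils import strip_jsonp

    print(strip_jsonp('func({"status": "success"});'))  # -> {"status": "success"}
    print(strip_jsonp('({"status": "success"});'))      # -> {"status": "success"}
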
test/test_utils.py | 4 ++++ youtube_dl/utils.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index e63af0166..de841b1a0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -717,6 +717,10 @@ class TestUtil(unittest.TestCase): d = json.loads(stripped) self.assertEqual(d, {'status': 'success'}) + stripped = strip_jsonp('({"status": "success"});') + d = json.loads(stripped) + self.assertEqual(d, {'status': 'success'}) + def test_uppercase_escape(self): self.assertEqual(uppercase_escape('aä'), 'aä') self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8c45166d7..b8700efcb 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2282,7 +2282,7 @@ def parse_age_limit(s): def strip_jsonp(code): return re.sub( r'''(?sx)^ - (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+) + (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*) (?:\s*&&\s*(?P=func_name))? \s*\(\s*(?P<callback_data>.*)\);? \s*?(?://[^\n]*)*$''', From edb0e17188a1197afae1a0d594e4aab7d27bbcf2 Mon Sep 17 00:00:00 2001 From: Kazuma Takahara <4269kzm@gmail.com> Date: Sat, 21 Jul 2018 19:41:33 +0900 Subject: [PATCH 078/111] [iwara] Fix download URLs (closes #17026) --- youtube_dl/extractor/iwara.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index a7514fc80..250140d91 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -77,7 +77,7 @@ class IwaraIE(InfoExtractor): height = int_or_none(self._search_regex( r'(\d+)p', format_id, 'height', default=None)) formats.append({ - 'url': a_format['uri'], + 'url': self._proto_relative_url(a_format['uri'], 'https:'), 'format_id': format_id, 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', 'height': height, From b96b4be4619b1e090650212380a92fb068f2fd21 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 21 Jul 2018 11:49:55 +0100 Subject: [PATCH 079/111] [bbc] add support for BBC Radio Play pages(closes #17022) --- youtube_dl/extractor/bbc.py | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 293d82b0f..641bf6073 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -778,6 +778,17 @@ class BBCIE(BBCCoUkIE): 'params': { 'skip_download': True, } + }, { + # window.__PRELOADED_STATE__ + 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', + 'info_dict': { + 'id': 'b0b9z4vz', + 'ext': 'mp4', + 'title': 'Prom 6: An American in Paris and Turangalila', + 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8', + 'uploader': 'Radio 3', + 'uploader_id': 'bbc_radio_three', + }, }] @classmethod @@ -1000,6 +1011,36 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + preload_state = self._parse_json(self._search_regex( + r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if preload_state: + current_programme = preload_state.get('programmes', {}).get('current') or {} + programme_id = current_programme.get('id') + if current_programme and programme_id and current_programme.get('type') == 'playable_item': + title = current_programme.get('titles', {}).get('tertiary') or playlist_title + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + synopses = current_programme.get('synopses') or {} + network = 
current_programme.get('network') or {} + duration = int_or_none( + current_programme.get('duration', {}).get('value')) + thumbnail = None + image_url = current_programme.get('image_url') + if image_url: + thumbnail = image_url.replace('{recipe}', '1920x1920') + return { + 'id': programme_id, + 'title': title, + 'description': dict_get(synopses, ('long', 'medium', 'short')), + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': network.get('short_title'), + 'uploader_id': network.get('id'), + 'formats': formats, + 'subtitles': subtitles, + } + bbc3_config = self._parse_json( self._search_regex( r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, From af03000ad5a445f03fbacb63ce626f8dcfe785c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Jul 2018 18:01:06 +0700 Subject: [PATCH 080/111] [utils] Introduce url_or_none --- test/test_utils.py | 11 +++++++++++ youtube_dl/utils.py | 7 +++++++ 2 files changed, 18 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index de841b1a0..8da5ccc56 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -78,6 +78,7 @@ from youtube_dl.utils import ( uppercase_escape, lowercase_escape, url_basename, + url_or_none, base_url, urljoin, urlencode_postdata, @@ -507,6 +508,16 @@ class TestUtil(unittest.TestCase): self.assertEqual(urljoin('http://foo.de/', ['foobar']), None) self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt') + def test_url_or_none(self): + self.assertEqual(url_or_none(None), None) + self.assertEqual(url_or_none(''), None) + self.assertEqual(url_or_none('foo'), None) + self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') + self.assertEqual(url_or_none('https://foo.de'), 'https://foo.de') + self.assertEqual(url_or_none('http$://foo.de'), None) + self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') + self.assertEqual(url_or_none('//foo.de'), '//foo.de') + def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) self.assertEqual(parse_age_limit(False), None) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b8700efcb..b84436ed6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1866,6 +1866,13 @@ def strip_or_none(v): return None if v is None else v.strip() +def url_or_none(url): + if not url or not isinstance(url, compat_str): + return None + url = url.strip() + return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None + + def parse_duration(s): if not isinstance(s, compat_basestring): return None From 4ecf300d13a6503ae80b76e01047b41d86ab4d92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Jul 2018 18:02:41 +0700 Subject: [PATCH 081/111] [iwara] Improve extraction --- youtube_dl/extractor/iwara.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index 250140d91..907d5fc8b 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -7,6 +7,7 @@ from ..utils import ( int_or_none, mimetype2ext, remove_end, + url_or_none, ) @@ -73,11 +74,14 @@ class IwaraIE(InfoExtractor): formats = [] for a_format in video_data: + format_uri = url_or_none(a_format.get('uri')) + if not format_uri: + continue format_id = a_format.get('resolution') height = int_or_none(self._search_regex( r'(\d+)p', format_id, 'height', default=None)) formats.append({ - 'url': self._proto_relative_url(a_format['uri'], 'https:'), + 'url': 
self._proto_relative_url(format_uri, 'https:'), 'format_id': format_id, 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', 'height': height, From 3052a30d4259b182904e5d2430077039461745bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Jul 2018 19:08:28 +0700 Subject: [PATCH 082/111] Improve URL extraction --- youtube_dl/extractor/adultswim.py | 3 ++- youtube_dl/extractor/afreecatv.py | 3 ++- youtube_dl/extractor/amp.py | 15 ++++++++------- youtube_dl/extractor/animeondemand.py | 3 ++- youtube_dl/extractor/aol.py | 3 ++- youtube_dl/extractor/apa.py | 6 +++--- youtube_dl/extractor/aparat.py | 3 ++- youtube_dl/extractor/ard.py | 4 ++-- youtube_dl/extractor/bandcamp.py | 7 ++++--- youtube_dl/extractor/breakcom.py | 10 ++++++---- youtube_dl/extractor/cammodels.py | 6 +++--- youtube_dl/extractor/ccma.py | 6 +++--- youtube_dl/extractor/crackle.py | 14 ++++++-------- youtube_dl/extractor/dctp.py | 7 ++++--- youtube_dl/extractor/discoverygo.py | 7 +++---- youtube_dl/extractor/dramafever.py | 10 +++++----- youtube_dl/extractor/eagleplatform.py | 8 +++----- youtube_dl/extractor/egghead.py | 8 +++++--- youtube_dl/extractor/eporner.py | 5 +++-- youtube_dl/extractor/firsttv.py | 5 +++-- youtube_dl/extractor/francetv.py | 8 ++++---- youtube_dl/extractor/frontendmasters.py | 3 ++- youtube_dl/extractor/generic.py | 5 +++-- youtube_dl/extractor/hidive.py | 10 +++++----- youtube_dl/extractor/imdb.py | 6 +++--- youtube_dl/extractor/instagram.py | 3 ++- youtube_dl/extractor/itv.py | 5 +++-- youtube_dl/extractor/keezmovies.py | 9 ++++----- youtube_dl/extractor/konserthusetplay.py | 5 +++-- youtube_dl/extractor/mediasite.py | 5 +++-- youtube_dl/extractor/peertube.py | 5 +++-- youtube_dl/extractor/redtube.py | 6 +++--- youtube_dl/extractor/rentv.py | 5 +++-- youtube_dl/extractor/rutube.py | 5 +++-- youtube_dl/extractor/turner.py | 5 +++-- youtube_dl/extractor/tvnet.py | 7 +++---- youtube_dl/extractor/tvplay.py | 4 +++- youtube_dl/extractor/twitch.py | 5 +++-- youtube_dl/extractor/udemy.py | 13 +++++++------ youtube_dl/extractor/vidme.py | 10 ++++------ youtube_dl/extractor/vk.py | 4 +++- youtube_dl/extractor/xhamster.py | 7 +++++-- youtube_dl/extractor/yapfiles.py | 8 ++++---- youtube_dl/extractor/youjizz.py | 6 +++--- youtube_dl/extractor/youporn.py | 6 +++--- youtube_dl/extractor/zattoo.py | 5 +++-- youtube_dl/extractor/zdf.py | 12 +++++++----- 47 files changed, 166 insertions(+), 139 deletions(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index acc4ce38d..88c96a950 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -7,6 +7,7 @@ from .turner import TurnerBaseIE from ..utils import ( int_or_none, strip_or_none, + url_or_none, ) @@ -98,7 +99,7 @@ class AdultSwimIE(TurnerBaseIE): if not video_id: entries = [] for episode in video_data.get('archiveEpisodes', []): - episode_url = episode.get('url') + episode_url = url_or_none(episode.get('url')) if not episode_url: continue entries.append(self.url_result( diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 4b3d97136..6275e5209 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + url_or_none, urlencode_postdata, xpath_text, ) @@ -304,7 +305,7 @@ class AfreecaTVIE(InfoExtractor): file_elements = video_element.findall(compat_xpath('./file')) one = len(file_elements) == 1 for file_num, 
file_element in enumerate(file_elements, start=1): - file_url = file_element.text + file_url = url_or_none(file_element.text) if not file_url: continue key = file_element.get('key', '') diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index fde1a8ff7..7ff098cfa 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -3,11 +3,12 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - int_or_none, - parse_iso8601, - mimetype2ext, determine_ext, ExtractorError, + int_or_none, + mimetype2ext, + parse_iso8601, + url_or_none, ) @@ -35,7 +36,7 @@ class AMPIE(InfoExtractor): media_thumbnail = [media_thumbnail] for thumbnail_data in media_thumbnail: thumbnail = thumbnail_data.get('@attributes', {}) - thumbnail_url = thumbnail.get('url') + thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue thumbnails.append({ @@ -51,7 +52,7 @@ class AMPIE(InfoExtractor): media_subtitle = [media_subtitle] for subtitle_data in media_subtitle: subtitle = subtitle_data.get('@attributes', {}) - subtitle_href = subtitle.get('href') + subtitle_href = url_or_none(subtitle.get('href')) if not subtitle_href: continue subtitles.setdefault(subtitle.get('lang') or 'en', []).append({ @@ -65,7 +66,7 @@ class AMPIE(InfoExtractor): media_content = [media_content] for media_data in media_content: media = media_data.get('@attributes', {}) - media_url = media.get('url') + media_url = url_or_none(media.get('url')) if not media_url: continue ext = mimetype2ext(media.get('type')) or determine_ext(media_url) @@ -79,7 +80,7 @@ class AMPIE(InfoExtractor): else: formats.append({ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), - 'url': media['url'], + 'url': media_url, 'tbr': int_or_none(media.get('bitrate')), 'filesize': int_or_none(media.get('fileSize')), 'ext': ext, diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 1fe5d5e56..00ce684d1 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, extract_attributes, ExtractorError, + url_or_none, urlencode_postdata, urljoin, ) @@ -165,7 +166,7 @@ class AnimeOnDemandIE(InfoExtractor): }, fatal=False) if not playlist: continue - stream_url = playlist.get('streamurl') + stream_url = url_or_none(playlist.get('streamurl')) if stream_url: rtmp = re.search( r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)', diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b50f454ee..cb9279193 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + url_or_none, ) @@ -77,7 +78,7 @@ class AolIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) for rendition in video_data.get('renditions', []): - video_url = rendition.get('url') + video_url = url_or_none(rendition.get('url')) if not video_url: continue ext = rendition.get('format') diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py index a30a935aa..98ccdaa4a 100644 --- a/youtube_dl/extractor/apa.py +++ b/youtube_dl/extractor/apa.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, js_to_json, + url_or_none, ) 
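Note: this apa hunk, like nearly every hunk in this patch, is the same
mechanical substitution - a manual isinstance(..., compat_str) check becomes
url_or_none, which additionally verifies that the value looks like a URL,
i.e. matches r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//'. A minimal sketch of the
helper's behaviour, mirroring the tests added alongside it, assuming a
youtube-dl checkout on the Python path:

    from youtube_dl.utils import url_or_none

    print(url_or_none('http://foo.de'))  # -> http://foo.de
    print(url_or_none('//foo.de'))       # -> //foo.de (protocol-relative URLs pass)
    print(url_or_none('foo'))            # -> None
    print(url_or_none(None))             # -> None
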
@@ -68,8 +68,8 @@ class APAIE(InfoExtractor): for source in sources: if not isinstance(source, dict): continue - source_url = source.get('file') - if not source_url or not isinstance(source_url, compat_str): + source_url = url_or_none(source.get('file')) + if not source_url: continue ext = determine_ext(source_url) if ext == 'm3u8': diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index e394cb661..6eb8bbb6e 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, mimetype2ext, + url_or_none, ) @@ -43,7 +44,7 @@ class AparatIE(InfoExtractor): formats = [] for item in file_list[0]: - file_url = item.get('file') + file_url = url_or_none(item.get('file')) if not file_url: continue ext = mimetype2ext(item.get('type')) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 86951d975..23f574d36 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from .generic import GenericIE -from ..compat import compat_str from ..utils import ( determine_ext, ExtractorError, @@ -15,6 +14,7 @@ from ..utils import ( unified_strdate, xpath_text, update_url_query, + url_or_none, ) from ..compat import compat_etree_fromstring @@ -100,7 +100,7 @@ class ARDMediathekIE(InfoExtractor): quality = stream.get('_quality') server = stream.get('_server') for stream_url in stream_urls: - if not isinstance(stream_url, compat_str) or '//' not in stream_url: + if not url_or_none(stream_url): continue ext = determine_ext(stream_url) if quality != 'auto' and ext in ('f4m', 'm3u8'): diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index be41bd5a2..b8514734d 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -19,6 +19,7 @@ from ..utils import ( unescapeHTML, update_url_query, unified_strdate, + url_or_none, ) @@ -131,8 +132,8 @@ class BandcampIE(InfoExtractor): fatal=False) if not stat: continue - retry_url = stat.get('retry_url') - if not isinstance(retry_url, compat_str): + retry_url = url_or_none(stat.get('retry_url')) + if not retry_url: continue formats.append({ 'url': self._proto_relative_url(retry_url, 'http:'), @@ -306,7 +307,7 @@ class BandcampWeeklyIE(InfoExtractor): formats = [] for format_id, format_url in show['audio_stream'].items(): - if not isinstance(format_url, compat_str): + if not url_or_none(format_url): continue for known_ext in KNOWN_EXTENSIONS: if known_ext in format_id: diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 70d16767f..68c7cf2bb 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -4,8 +4,10 @@ import re from .common import InfoExtractor from .youtube import YoutubeIE -from ..compat import compat_str -from ..utils import int_or_none +from ..utils import ( + int_or_none, + url_or_none, +) class BreakIE(InfoExtractor): @@ -55,8 +57,8 @@ class BreakIE(InfoExtractor): formats = [] for video in content: - video_url = video.get('url') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(video.get('url')) + if not video_url: continue bitrate = int_or_none(self._search_regex( r'(\d+)_kbps', video_url, 'tbr', default=None)) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index ee0165dba..79350817f 100644 --- a/youtube_dl/extractor/cammodels.py +++ 
b/youtube_dl/extractor/cammodels.py @@ -2,10 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, + url_or_none, ) @@ -56,8 +56,8 @@ class CamModelsIE(InfoExtractor): for media in encodings: if not isinstance(media, dict): continue - media_url = media.get('location') - if not media_url or not isinstance(media_url, compat_str): + media_url = url_or_none(media.get('location')) + if not media_url: continue format_id_list = [format_id] diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index 07f5206c1..544647f92 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -4,13 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, int_or_none, parse_duration, parse_iso8601, parse_resolution, + url_or_none, ) @@ -53,8 +53,8 @@ class CCMAIE(InfoExtractor): media_url = media['media']['url'] if isinstance(media_url, list): for format_ in media_url: - format_url = format_.get('file') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(format_.get('file')) + if not format_url: continue label = format_.get('label') f = parse_resolution(label) diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index f4a616455..8dd9d6687 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -4,16 +4,14 @@ from __future__ import unicode_literals, division import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) +from ..compat import compat_HTTPError from ..utils import ( determine_ext, float_or_none, int_or_none, parse_age_limit, parse_duration, + url_or_none, ExtractorError ) @@ -86,8 +84,8 @@ class CrackleIE(InfoExtractor): for e in media['MediaURLs']: if e.get('UseDRM') is True: continue - format_url = e.get('Path') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(e.get('Path')) + if not format_url: continue ext = determine_ext(format_url) if ext == 'm3u8': @@ -124,8 +122,8 @@ class CrackleIE(InfoExtractor): for cc_file in cc_files: if not isinstance(cc_file, dict): continue - cc_url = cc_file.get('Path') - if not cc_url or not isinstance(cc_url, compat_str): + cc_url = url_or_none(cc_file.get('Path')) + if not cc_url: continue lang = cc_file.get('Locale') or 'en' subtitles.setdefault(lang, []).append({'url': cc_url}) diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index dc0c41b8a..769a219df 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -7,6 +7,7 @@ from ..utils import ( float_or_none, int_or_none, unified_timestamp, + url_or_none, ) @@ -69,7 +70,7 @@ class DctpTvIE(InfoExtractor): endpoint = next( server['endpoint'] for server in servers - if isinstance(server.get('endpoint'), compat_str) and + if url_or_none(server.get('endpoint')) and 'cloudfront' in server['endpoint']) else: endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/' @@ -92,8 +93,8 @@ class DctpTvIE(InfoExtractor): for image in images: if not isinstance(image, dict): continue - image_url = image.get('url') - if not image_url or not isinstance(image_url, compat_str): + image_url = url_or_none(image.get('url')) + if not image_url: continue thumbnails.append({ 'url': image_url, diff --git a/youtube_dl/extractor/discoverygo.py 
b/youtube_dl/extractor/discoverygo.py index 3368c4c07..9e7b14a7d 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, extract_attributes, @@ -12,6 +11,7 @@ from ..utils import ( parse_age_limit, remove_end, unescapeHTML, + url_or_none, ) @@ -69,9 +69,8 @@ class DiscoveryGoBaseIE(InfoExtractor): captions = stream.get('captions') if isinstance(captions, list): for caption in captions: - subtitle_url = caption.get('fileUrl') - if (not subtitle_url or not isinstance(subtitle_url, compat_str) or - not subtitle_url.startswith('http')): + subtitle_url = url_or_none(caption.get('fileUrl')) + if not subtitle_url or not subtitle_url.startswith('http'): continue lang = caption.get('fileLang', 'en') ext = determine_ext(subtitle_url) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index ab32ba4ff..db1de699f 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -7,7 +7,6 @@ import json from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_str, compat_urlparse, ) from ..utils import ( @@ -17,6 +16,7 @@ from ..utils import ( parse_age_limit, parse_duration, unified_timestamp, + url_or_none, ) @@ -139,8 +139,8 @@ class DramaFeverIE(DramaFeverBaseIE): for sub in subs: if not isinstance(sub, dict): continue - sub_url = sub.get('url') - if not sub_url or not isinstance(sub_url, compat_str): + sub_url = url_or_none(sub.get('url')) + if not sub_url: continue subtitles.setdefault( sub.get('code') or sub.get('language') or 'en', []).append({ @@ -163,8 +163,8 @@ class DramaFeverIE(DramaFeverBaseIE): for format_id, format_dict in download_assets.items(): if not isinstance(format_dict, dict): continue - format_url = format_dict.get('url') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(format_dict.get('url')) + if not format_url: continue formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 42789278e..36fef07b7 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -4,14 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, unsmuggle_url, + url_or_none, ) @@ -177,7 +175,7 @@ class EaglePlatformIE(InfoExtractor): video_id, 'Downloading mp4 JSON', fatal=False) if mp4_data: for format_id, format_url in mp4_data.get('data', {}).items(): - if not isinstance(format_url, compat_str): + if not url_or_none(format_url): continue height = int_or_none(format_id) if height is not None and m3u8_formats_dict.get(height): diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index edabaafe6..df11dc206 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -8,6 +8,7 @@ from ..utils import ( int_or_none, try_get, unified_timestamp, + url_or_none, ) @@ -34,8 +35,8 @@ class EggheadCourseIE(InfoExtractor): entries = [] for lesson in lessons: - lesson_url = lesson.get('http_url') - if not lesson_url or not isinstance(lesson_url, compat_str): + lesson_url = url_or_none(lesson.get('http_url')) + if not lesson_url: continue 
lesson_id = lesson.get('id') if lesson_id: @@ -95,7 +96,8 @@ class EggheadLessonIE(InfoExtractor): formats = [] for _, format_url in lesson['media_urls'].items(): - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(format_url) + if not format_url: continue ext = determine_ext(format_url) if ext == 'm3u8': diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 81f2e2ee1..6d03d7095 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -11,6 +11,7 @@ from ..utils import ( int_or_none, parse_duration, str_to_int, + url_or_none, ) @@ -82,8 +83,8 @@ class EpornerIE(InfoExtractor): for format_id, format_dict in formats_dict.items(): if not isinstance(format_dict, dict): continue - src = format_dict.get('src') - if not isinstance(src, compat_str) or not src.startswith('http'): + src = url_or_none(format_dict.get('src')) + if not src or not src.startswith('http'): continue if kind == 'hls': formats.extend(self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 4803a22c8..28617d83c 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, qualities, unified_strdate, + url_or_none, ) @@ -88,8 +89,8 @@ class FirstTVIE(InfoExtractor): formats = [] path = None for f in item.get('mbr', []): - src = f.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(f.get('src')) + if not src: continue tbr = int_or_none(self._search_regex( r'_(\d{3,})\.mp4', src, 'tbr', default=None)) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 6fc6b0da0..2ffe83a78 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -16,6 +16,7 @@ from ..utils import ( int_or_none, parse_duration, try_get, + url_or_none, ) from .dailymotion import DailymotionIE @@ -115,14 +116,13 @@ class FranceTVIE(InfoExtractor): def sign(manifest_url, manifest_id): for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'): - signed_url = self._download_webpage( + signed_url = url_or_none(self._download_webpage( 'https://%s/esi/TA' % host, video_id, 'Downloading signed %s manifest URL' % manifest_id, fatal=False, query={ 'url': manifest_url, - }) - if (signed_url and isinstance(signed_url, compat_str) and - re.search(r'^(?:https?:)?//', signed_url)): + })) + if signed_url: return signed_url return manifest_url diff --git a/youtube_dl/extractor/frontendmasters.py b/youtube_dl/extractor/frontendmasters.py index 770db46d0..cb57ba007 100644 --- a/youtube_dl/extractor/frontendmasters.py +++ b/youtube_dl/extractor/frontendmasters.py @@ -11,6 +11,7 @@ from ..compat import ( from ..utils import ( ExtractorError, parse_duration, + url_or_none, urlencode_postdata, ) @@ -80,7 +81,7 @@ class FrontendMastersPageBaseIE(FrontendMastersBaseIE): chapters = [] lesson_elements = course.get('lessonElements') if isinstance(lesson_elements, list): - chapters = [e for e in lesson_elements if isinstance(e, compat_str)] + chapters = [url_or_none(e) for e in lesson_elements if url_or_none(e)] return chapters @staticmethod diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index aa04905ed..e5a8ffbe8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -32,6 +32,7 @@ from ..utils import ( unified_strdate, unsmuggle_url, UnsupportedError, + url_or_none, xpath_text, ) from .commonprotocols 
import RtmpIE @@ -3130,8 +3131,8 @@ class GenericIE(InfoExtractor): sources = [sources] formats = [] for source in sources: - src = source.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(source.get('src')) + if not src: continue src = compat_urlparse.urljoin(url, src) src_type = source.get('type') diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py index 39fabe8a5..f26f80265 100644 --- a/youtube_dl/extractor/hidive.py +++ b/youtube_dl/extractor/hidive.py @@ -8,6 +8,7 @@ from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, + url_or_none, urlencode_postdata, ) @@ -80,8 +81,8 @@ class HiDiveIE(InfoExtractor): bitrates = rendition.get('bitrates') if not isinstance(bitrates, dict): continue - m3u8_url = bitrates.get('hls') - if not isinstance(m3u8_url, compat_str): + m3u8_url = url_or_none(bitrates.get('hls')) + if not m3u8_url: continue formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', @@ -93,9 +94,8 @@ class HiDiveIE(InfoExtractor): if not isinstance(cc_file, list) or len(cc_file) < 3: continue cc_lang = cc_file[0] - cc_url = cc_file[2] - if not isinstance(cc_lang, compat_str) or not isinstance( - cc_url, compat_str): + cc_url = url_or_none(cc_file[2]) + if not isinstance(cc_lang, compat_str) or not cc_url: continue subtitles.setdefault(cc_lang, []).append({ 'url': cc_url, diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 4bafa54a2..fba01ef49 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -3,12 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, mimetype2ext, parse_duration, qualities, + url_or_none, ) @@ -61,8 +61,8 @@ class ImdbIE(InfoExtractor): for encoding in video_metadata.get('encodings', []): if not encoding or not isinstance(encoding, dict): continue - video_url = encoding.get('videoUrl') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(encoding.get('videoUrl')) + if not video_url: continue ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType'))) if ext == 'm3u8': diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 0c13f54ee..7e0e838f0 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -17,6 +17,7 @@ from ..utils import ( lowercase_escape, std_headers, try_get, + url_or_none, ) @@ -170,7 +171,7 @@ class InstagramIE(InfoExtractor): node = try_get(edge, lambda x: x['node'], dict) if not node: continue - node_video_url = try_get(node, lambda x: x['video_url'], compat_str) + node_video_url = url_or_none(node.get('video_url')) if not node_video_url: continue entries.append({ diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index d05a7b68d..de65b6bb4 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -20,6 +20,7 @@ from ..utils import ( merge_dicts, parse_duration, smuggle_url, + url_or_none, xpath_with_ns, xpath_element, xpath_text, @@ -250,8 +251,8 @@ class ITVIE(InfoExtractor): for sub in subs: if not isinstance(sub, dict): continue - href = sub.get('Href') - if isinstance(href, compat_str): + href = url_or_none(sub.get('Href')) + if href: extract_subtitle(href) if not info.get('duration'): info['duration'] = parse_duration(video_data.get('Duration')) diff --git a/youtube_dl/extractor/keezmovies.py 
b/youtube_dl/extractor/keezmovies.py index d4e6f7ac1..c3eb74c17 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -4,16 +4,14 @@ import re from .common import InfoExtractor from ..aes import aes_decrypt_text -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) +from ..compat import compat_urllib_parse_unquote from ..utils import ( determine_ext, ExtractorError, int_or_none, str_to_int, strip_or_none, + url_or_none, ) @@ -55,7 +53,8 @@ class KeezMoviesIE(InfoExtractor): encrypted = False def extract_format(format_url, height=None): - if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//')): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//')): return if format_url in format_urls: return diff --git a/youtube_dl/extractor/konserthusetplay.py b/youtube_dl/extractor/konserthusetplay.py index c11cbcf47..dd42bb2f2 100644 --- a/youtube_dl/extractor/konserthusetplay.py +++ b/youtube_dl/extractor/konserthusetplay.py @@ -2,11 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, int_or_none, + url_or_none, ) @@ -109,7 +109,8 @@ class KonserthusetPlayIE(InfoExtractor): captions = source.get('captionsAvailableLanguages') if isinstance(captions, dict): for lang, subtitle_url in captions.items(): - if lang != 'none' and isinstance(subtitle_url, compat_str): + subtitle_url = url_or_none(subtitle_url) + if lang != 'none' and subtitle_url: subtitles.setdefault(lang, []).append({'url': subtitle_url}) return { diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index 0e2645c55..84876b883 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -15,6 +15,7 @@ from ..utils import ( mimetype2ext, unescapeHTML, unsmuggle_url, + url_or_none, urljoin, ) @@ -156,8 +157,8 @@ class MediasiteIE(InfoExtractor): stream_formats = [] for unum, VideoUrl in enumerate(video_urls): - video_url = VideoUrl.get('Location') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(VideoUrl.get('Location')) + if not video_url: continue # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index d9849a2ba..e03c3d1d3 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -10,6 +10,7 @@ from ..utils import ( parse_resolution, try_get, unified_timestamp, + url_or_none, urljoin, ) @@ -200,8 +201,8 @@ class PeerTubeIE(InfoExtractor): for file_ in video['files']: if not isinstance(file_, dict): continue - file_url = file_.get('fileUrl') - if not file_url or not isinstance(file_url, compat_str): + file_url = url_or_none(file_.get('fileUrl')) + if not file_url: continue file_size = int_or_none(file_.get('size')) format_id = try_get( diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 879bcf81d..10311a81a 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -3,12 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, str_to_int, unified_strdate, + url_or_none, ) @@ -71,8 +71,8 @@ class RedTubeIE(InfoExtractor): video_id, fatal=False) if medias and isinstance(medias, list): for 
media in medias: - format_url = media.get('videoUrl') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(media.get('videoUrl')) + if not format_url: continue format_id = media.get('quality') formats.append({ diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py index 8bcf87126..7c8909d95 100644 --- a/youtube_dl/extractor/rentv.py +++ b/youtube_dl/extractor/rentv.py @@ -6,6 +6,7 @@ from ..compat import compat_str from ..utils import ( determine_ext, int_or_none, + url_or_none, ) @@ -37,8 +38,8 @@ class RENTVIE(InfoExtractor): title = config['title'] formats = [] for video in config['src']: - src = video.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(video.get('src')) + if not src: continue ext = determine_ext(src) if ext == 'm3u8': diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 89d89b65a..261bcbb83 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -16,6 +16,7 @@ from ..utils import ( int_or_none, try_get, unified_timestamp, + url_or_none, ) @@ -176,8 +177,8 @@ class RutubePlaylistBaseIE(RutubeBaseIE): break for result in results: - video_url = result.get('video_url') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(result.get('video_url')) + if not video_url: continue entry = self._extract_video(result, require_title=False) entry.update({ diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 2b7b0d6e1..4a6cbfbb8 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -15,6 +15,7 @@ from ..utils import ( update_url_query, ExtractorError, strip_or_none, + url_or_none, ) @@ -154,8 +155,8 @@ class TurnerBaseIE(AdobePassIE): subtitles = {} for source in video_data.findall('closedCaptions/source'): for track in source.findall('track'): - track_url = track.get('url') - if not isinstance(track_url, compat_str) or track_url.endswith('/big'): + track_url = url_or_none(track.get('url')) + if not track_url or track_url.endswith('/big'): continue lang = track.get('lang') or track.get('label') or 'en' subtitles.setdefault(lang, []).append({ diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py index 2b2630b91..4222ff9ee 100644 --- a/youtube_dl/extractor/tvnet.py +++ b/youtube_dl/extractor/tvnet.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, unescapeHTML, + url_or_none, ) @@ -106,9 +106,8 @@ class TVNetIE(InfoExtractor): for stream in self._download_json(data_file, video_id): if not isinstance(stream, dict): continue - stream_url = stream.get('url') - if (stream_url in stream_urls or not stream_url or - not isinstance(stream_url, compat_str)): + stream_url = url_or_none(stream.get('url')) + if stream_url in stream_urls or not stream_url: continue stream_urls.add(stream_url) formats.extend(self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index e09b5f804..d3adab457 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -19,6 +19,7 @@ from ..utils import ( try_get, unsmuggle_url, update_url_query, + url_or_none, ) @@ -255,7 +256,8 @@ class TVPlayIE(InfoExtractor): quality = qualities(['hls', 'medium', 'high']) formats = [] for format_id, video_url in streams.get('streams', {}).items(): - if not video_url or not 
isinstance(video_url, compat_str): + video_url = url_or_none(video_url) + if not video_url: continue ext = determine_ext(video_url) if ext == 'f4m': diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index e01f11331..89ee44224 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -27,6 +27,7 @@ from ..utils import ( unified_timestamp, update_url_query, urlencode_postdata, + url_or_none, urljoin, ) @@ -663,8 +664,8 @@ class TwitchClipsIE(TwitchBaseIE): for option in status['quality_options']: if not isinstance(option, dict): continue - source = option.get('source') - if not source or not isinstance(source, compat_str): + source = url_or_none(option.get('source')) + if not source: continue formats.append({ 'url': source, diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index a7196997e..79c45f80e 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -20,6 +20,7 @@ from ..utils import ( sanitized_Request, try_get, unescapeHTML, + url_or_none, urlencode_postdata, ) @@ -265,8 +266,8 @@ class UdemyIE(InfoExtractor): if not isinstance(source_list, list): return for source in source_list: - video_url = source.get('file') or source.get('src') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(source.get('file') or source.get('src')) + if not video_url: continue if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -293,8 +294,8 @@ class UdemyIE(InfoExtractor): continue if track.get('kind') != 'captions': continue - src = track.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(track.get('src')) + if not src: continue lang = track.get('language') or track.get( 'srclang') or track.get('label') @@ -314,8 +315,8 @@ class UdemyIE(InfoExtractor): for cc in captions: if not isinstance(cc, dict): continue - cc_url = cc.get('url') - if not cc_url or not isinstance(cc_url, compat_str): + cc_url = url_or_none(cc.get('url')) + if not cc_url: continue lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) sub_dict = (automatic_captions if cc.get('source') == 'auto' diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 59adb2377..174e69cd6 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -3,15 +3,13 @@ from __future__ import unicode_literals import itertools from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, float_or_none, parse_iso8601, + url_or_none, ) @@ -166,8 +164,8 @@ class VidmeIE(InfoExtractor): formats = [] for f in video.get('formats', []): - format_url = f.get('uri') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(f.get('uri')) + if not format_url: continue format_type = f.get('type') if format_type == 'dash': diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 29002b35f..48b5987c2 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -20,6 +20,7 @@ from ..utils import ( str_to_int, unescapeHTML, unified_timestamp, + url_or_none, urlencode_postdata, ) from .dailymotion import DailymotionIE @@ -423,7 +424,8 @@ class VKIE(VKBaseIE): formats = [] for format_id, format_url in data.items(): - if not isinstance(format_url, compat_str) or not format_url.startswith(('http', 
'//', 'rtmp')): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//', 'rtmp')): continue if (format_id.startswith(('url', 'cache')) or format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index d1bc992fd..68a48034e 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -13,6 +13,7 @@ from ..utils import ( parse_duration, try_get, unified_strdate, + url_or_none, ) @@ -137,7 +138,8 @@ class XHamsterIE(InfoExtractor): else: format_url = format_item filesize = None - if not isinstance(format_url, compat_str): + format_url = url_or_none(format_url) + if not format_url: continue formats.append({ 'format_id': '%s-%s' % (format_id, quality), @@ -198,7 +200,8 @@ class XHamsterIE(InfoExtractor): default='{}'), video_id, fatal=False) for format_id, format_url in sources.items(): - if not isinstance(format_url, compat_str): + format_url = url_or_none(format_url) + if not format_url: continue if format_url in format_urls: continue diff --git a/youtube_dl/extractor/yapfiles.py b/youtube_dl/extractor/yapfiles.py index 7fafbf596..cfb368de9 100644 --- a/youtube_dl/extractor/yapfiles.py +++ b/youtube_dl/extractor/yapfiles.py @@ -4,12 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, qualities, unescapeHTML, + url_or_none, ) @@ -80,9 +80,9 @@ class YapFilesIE(InfoExtractor): formats = [] for format_id in QUALITIES: is_hd = format_id == 'hd' - format_url = playlist.get( - 'file%s' % ('_hd' if is_hd else '')) - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(playlist.get( + 'file%s' % ('_hd' if is_hd else ''))) + if not format_url: continue formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index f33fabe19..dff69fcb7 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -3,11 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, int_or_none, parse_duration, + url_or_none, ) @@ -50,8 +50,8 @@ class YouJizzIE(InfoExtractor): for encoding in encodings: if not isinstance(encoding, dict): continue - format_url = encoding.get('filename') - if not isinstance(format_url, compat_str): + format_url = url_or_none(encoding.get('filename')) + if not format_url: continue if determine_ext(format_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 547adefeb..ea0bce784 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, sanitized_Request, str_to_int, unescapeHTML, unified_strdate, + url_or_none, ) from ..aes import aes_decrypt_text @@ -88,8 +88,8 @@ class YouPornIE(InfoExtractor): for definition in definitions: if not isinstance(definition, dict): continue - video_url = definition.get('videoUrl') - if isinstance(video_url, compat_str) and video_url: + video_url = url_or_none(definition.get('videoUrl')) + if video_url: links.append(video_url) # Fallback #1, this also contains extra low quality 180p format diff 
--git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index b5a3a0716..fb167c198 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -13,6 +13,7 @@ from ..utils import ( ExtractorError, int_or_none, try_get, + url_or_none, urlencode_postdata, ) @@ -150,8 +151,8 @@ class ZattooBaseIE(InfoExtractor): for watch in watch_urls: if not isinstance(watch, dict): continue - watch_url = watch.get('url') - if not watch_url or not isinstance(watch_url, compat_str): + watch_url = url_or_none(watch.get('url')) + if not watch_url: continue format_id_list = [stream_type] maxrate = watch.get('maxrate') diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index bb9020c91..afa3f6c47 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -15,6 +15,7 @@ from ..utils import ( try_get, unified_timestamp, update_url_query, + url_or_none, urljoin, ) @@ -67,8 +68,8 @@ class ZDFIE(ZDFBaseIE): def _extract_subtitles(src): subtitles = {} for caption in try_get(src, lambda x: x['captions'], list) or []: - subtitle_url = caption.get('uri') - if subtitle_url and isinstance(subtitle_url, compat_str): + subtitle_url = url_or_none(caption.get('uri')) + if subtitle_url: lang = caption.get('language', 'deu') subtitles.setdefault(lang, []).append({ 'url': subtitle_url, @@ -76,8 +77,8 @@ class ZDFIE(ZDFBaseIE): return subtitles def _extract_format(self, video_id, formats, format_urls, meta): - format_url = meta.get('url') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(meta.get('url')) + if not format_url: return if format_url in format_urls: return @@ -152,7 +153,8 @@ class ZDFIE(ZDFBaseIE): content, lambda x: x['teaserImageRef']['layouts'], dict) if layouts: for layout_key, layout_url in layouts.items(): - if not isinstance(layout_url, compat_str): + layout_url = url_or_none(layout_url) + if not layout_url: continue thumbnail = { 'url': layout_url, From 6f27998e750d8409f03cce2754ea3e9066b3b794 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Jul 2018 20:58:30 +0700 Subject: [PATCH 083/111] [ChangeLog] Actualize [ci skip] --- ChangeLog | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ChangeLog b/ChangeLog index 1d602079e..bda7be4a9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,26 @@ +version <unreleased> + +Core ++ [utils] Introduce url_or_none +* [utils] Allow JSONP without function name (#17028) ++ [extractor/common] Extract DASH and MSS formats from SMIL manifests + +Extractors ++ [bbc] Add support for BBC Radio Play pages (#17022) +* [iwara] Fix download URLs (#17026) +* [vrtnu] Relax title extraction and extract JSON-LD (#17018) ++ [viu] Pass Referer and Origin headers and area id (#16992) ++ [vimeo] Add another config regular expression (#17013) ++ [facebook] Extract view count (#16942) +* [dailymotion] Improve description extraction (#16984) +* [slutload] Fix and improve extraction (#17001) +* [mediaset] Fix extraction (#16977) ++ [theplatform] Add support for theplatform TLD customization (#16977) +* [imgur] Relax URL regular expression (#16987) +* [pornhub] Improve extraction and extract all formats (#12166, #15891, #16262, + #16959) + + version 2018.07.10 Core From 8e66ffc3b7df8ad78e7f9e2e77d026c84027a814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Jul 2018 21:00:18 +0700 Subject: [PATCH 084/111] release 2018.07.21 --- .github/ISSUE_TEMPLATE.md | 6 +++--- 
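The helper behind the "[utils] Introduce url_or_none" entry above is not itself included in this excerpt of the series. A minimal sketch of the function, inferred from how the refactored extractors call it (the exact regular expression in youtube_dl/utils.py may differ), would be:

    import re

    from .compat import compat_str  # youtube-dl's py2/py3 text type


    def url_or_none(url):
        # Give back the input only when it is a string that looks like an
        # absolute or protocol-relative URL; everything else maps to None,
        # which is why callers can guard with a bare `if not url:`.
        if not url or not isinstance(url, compat_str):
            return None
        url = url.strip()
        return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None

Note that any scheme passes such a check, which is why the vk.py hunk above still narrows the result with startswith(('http', '//', 'rtmp')) afterwards.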
ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f192c6633..24827ba8f 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.10** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.21*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.21** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.07.10 +[debug] youtube-dl version 2018.07.21 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index bda7be4a9..94ecaae8e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.07.21 Core + [utils] Introduce url_or_none diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c7083cf47..9bf0ea30d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.07.10' +__version__ = '2018.07.21' From 6de82b44768624b888fd141e009540ff3bed9e6a Mon Sep 17 00:00:00 2001 From: Enes <enessolak99@gmail.com> Date: Tue, 24 Apr 2018 19:02:38 +0300 Subject: [PATCH 085/111] [puhutv] Add extractor (closes #16010) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/puhutv.py | 232 +++++++++++++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 youtube_dl/extractor/puhutv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c6f8a785a..29fab5b9a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -860,6 +860,10 @@ from .pornhub import ( from .pornotube import PornotubeIE from .pornovoisines import PornoVoisinesIE from .pornoxo import PornoXOIE +from .puhutv import ( + PuhuTVIE, + PuhuTVSerieIE, +) from .presstv import PressTVIE from .primesharetv import PrimeShareTVIE from .promptfile import PromptFileIE diff --git a/youtube_dl/extractor/puhutv.py b/youtube_dl/extractor/puhutv.py new file mode 100644 index 000000000..8abdab52a --- /dev/null +++ b/youtube_dl/extractor/puhutv.py @@ -0,0 +1,232 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from 
..utils import ( + int_or_none, + float_or_none, + determine_ext, + str_or_none, + url_or_none, + unified_strdate, + unified_timestamp, + try_get, + url_basename, + remove_end +) + + +class PuhuTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[a-z0-9-]+)-izle' + IE_NAME = 'puhutv' + _TESTS = [ + { + # A Film + 'url': 'https://puhutv.com/sut-kardesler-izle', + 'md5': 'a347470371d56e1585d1b2c8dab01c96', + 'info_dict': { + 'id': 'sut-kardesler', + 'display_id': '5085', + 'ext': 'mp4', + 'title': 'Süt Kardeşler', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Arzu Film', + 'description': 'md5:405fd024df916ca16731114eb18e511a', + 'uploader_id': '43', + 'upload_date': '20160729', + 'timestamp': int, + }, + }, + { + # An Episode and geo restricted + 'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle', + 'only_matching': True, + }, + { + # Has subtitle + 'url': 'https://puhutv.com/dip-1-bolum-izle', + 'only_matching': True, + } + ] + _SUBTITLE_LANGS = { + 'English': 'en', + 'Deutsch': 'de', + 'عربى': 'ar' + } + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_json( + 'https://puhutv.com/api/slug/%s-izle' % video_id, video_id)['data'] + + display_id = compat_str(info['id']) + title = info['title']['name'] + if info.get('display_name'): + title = '%s %s' % (title, info.get('display_name')) + + description = try_get(info, lambda x: x['title']['description'], compat_str) or info.get('description') + timestamp = unified_timestamp(info.get('created_at')) + upload_date = unified_strdate(info.get('created_at')) + uploader = try_get(info, lambda x: x['title']['producer']['name'], compat_str) + uploader_id = str_or_none(try_get(info, lambda x: x['title']['producer']['id'])) + view_count = int_or_none(try_get(info, lambda x: x['content']['watch_count'])) + duration = float_or_none(try_get(info, lambda x: x['content']['duration_in_ms']), scale=1000) + thumbnail = try_get(info, lambda x: x['content']['images']['wide']['main'], compat_str) + release_year = int_or_none(try_get(info, lambda x: x['title']['released_at'])) + webpage_url = info.get('web_url') + + season_number = int_or_none(info.get('season_number')) + season_id = int_or_none(info.get('season_id')) + episode_number = int_or_none(info.get('episode_number')) + + tags = [] + for tag in try_get(info, lambda x: x['title']['genres'], list) or []: + if isinstance(tag.get('name'), compat_str): + tags.append(tag.get('name')) + + thumbnails = [] + thumbs_dict = try_get(info, lambda x: x['content']['images']['wide'], dict) or {} + for id, url in thumbs_dict.items(): + if not url_or_none(url): + continue + thumbnails.append({ + 'url': 'https://%s' % url, + 'id': id + }) + + subtitles = {} + for subtitle in try_get(info, lambda x: x['content']['subtitles'], list) or []: + if not isinstance(subtitle, dict): + continue + lang = subtitle.get('language') + sub_url = url_or_none(subtitle.get('url')) + if not lang or not isinstance(lang, compat_str) or not sub_url: + continue + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ + 'url': sub_url + }] + + # Some of videos are geo restricted upon request copyright owner and returns 403 + req_formats = self._download_json( + 'https://puhutv.com/api/assets/%s/videos' % display_id, + video_id, 'Downloading video JSON') + + formats = [] + for format in req_formats['data']['videos']: + media_url = url_or_none(format.get('url')) + if not media_url: + continue + ext = format.get('video_format') or determine_ext(media_url) + quality = 
format.get('quality') + if format.get('stream_type') == 'hls' and format.get('is_playlist') is True: + m3u8_id = remove_end(url_basename(media_url), '.m3u8') + formats.append(self._m3u8_meta_format(media_url, ext, m3u8_id=m3u8_id)) + elif ext == 'mp4' and format.get('is_playlist', False) is False: + formats.append({ + 'url': media_url, + 'format_id': 'http-%s' % quality, + 'ext': ext, + 'height': quality + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'season_id': season_id, + 'season_number': season_number, + 'episode_number': episode_number, + 'release_year': release_year, + 'upload_date': upload_date, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'view_count': view_count, + 'duration': duration, + 'tags': tags, + 'subtitles': subtitles, + 'webpage_url': webpage_url, + 'thumbnail': thumbnail, + 'thumbnails': thumbnails, + 'formats': formats + } + + +class PuhuTVSerieIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[a-z0-9-]+)-detay' + IE_NAME = 'puhutv:serie' + _TESTS = [ + { + 'url': 'https://puhutv.com/deniz-yildizi-detay', + 'info_dict': { + 'title': 'Deniz Yıldızı', + 'id': 'deniz-yildizi', + 'uploader': 'Focus Film', + 'uploader_id': 61, + }, + 'playlist_mincount': 234, + }, + { + # a film detail page which is using same url with serie page + 'url': 'https://puhutv.com/kaybedenler-kulubu-detay', + 'info_dict': { + 'title': 'Kaybedenler Kulübü', + 'id': 'kaybedenler-kulubu', + 'uploader': 'Tolga Örnek, Murat Dörtbudak, Neslihan Dörtbudak, Kemal Kaplanoğlu', + 'uploader_id': 248, + }, + 'playlist_mincount': 1, + }, + ] + + def _extract_entries(self, playlist_id, seasons): + for season in seasons: + season_id = season['id'] + season_number = season.get('position') + pagenum = 1 + has_more = True + while has_more is True: + season_info = self._download_json( + 'https://galadriel.puhutv.com/seasons/%s' % season_id, + playlist_id, 'Downloading season %s page %s' % (season_number, pagenum), query={ + 'page': pagenum, + 'per': 40, + }) + for episode in season_info.get('episodes'): + video_id = episode['slugPath'].replace('-izle', '') + yield self.url_result( + 'https://puhutv.com/%s-izle' % video_id, + PuhuTVIE.ie_key(), video_id) + pagenum = pagenum + 1 + has_more = season_info.get('hasMore', False) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + info = self._download_json( + 'https://puhutv.com/api/slug/%s-detay' % playlist_id, playlist_id)['data'] + + title = info.get('name') + uploader = try_get(info, lambda x: x['producer']['name'], compat_str) + uploader_id = try_get(info, lambda x: x['producer']['id']) + seasons = info.get('seasons') + if seasons: + entries = self._extract_entries(playlist_id, seasons) + else: + # For films, these are using same url with series + video_id = info['assets'][0]['slug'] + return self.url_result( + 'https://puhutv.com/%s-izle' % video_id, + PuhuTVIE.ie_key(), video_id) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': title, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'entries': entries, + } From 8fd2a7be373a29b9bea491f952f14315a90c2f3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 20:25:46 +0700 Subject: [PATCH 086/111] [puhutv] Improve extraction (closes #16269) --- youtube_dl/extractor/puhutv.py | 313 +++++++++++++++++---------------- 1 file changed, 164 insertions(+), 149 deletions(-) 
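Most of the 164 insertions below restructure the format mapping: master HLS playlists are expanded through the m3u8 helper, while single-file HLS renditions and progressive MP4 entries each become one format dict keyed by quality. A condensed sketch of the resulting loop, lifted from the diff that follows (the helper name _build_formats is illustrative, the logic sits inline in _real_extract, and url_or_none/int_or_none come from youtube_dl.utils):

    def _build_formats(self, videos, video_id):
        # `videos` is the decoded list from the /api/assets/<id>/videos call.
        formats = []
        for video in videos:
            media_url = url_or_none(video.get('url'))
            if not media_url:
                continue
            playlist = video.get('is_playlist')
            if video.get('stream_type') == 'hls' and playlist is True:
                # Master playlist: expand into one format per variant stream.
                formats.extend(self._extract_m3u8_formats(
                    media_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
                continue
            quality = int_or_none(video.get('quality'))
            f = {'url': media_url, 'ext': 'mp4', 'height': quality}
            video_format = video.get('video_format')
            if video_format == 'hls' and playlist is False:
                # Single-file HLS rendition, downloadable natively.
                f['format_id'] = 'hls'
                f['protocol'] = 'm3u8_native'
            elif video_format == 'mp4':
                f['format_id'] = 'http'
            else:
                continue
            if quality:
                f['format_id'] += '-%sp' % quality
            formats.append(f)
        self._sort_formats(formats)
        return formats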
diff --git a/youtube_dl/extractor/puhutv.py b/youtube_dl/extractor/puhutv.py index 8abdab52a..5465e8ab7 100644 --- a/youtube_dl/extractor/puhutv.py +++ b/youtube_dl/extractor/puhutv.py @@ -2,53 +2,54 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( + ExtractorError, int_or_none, float_or_none, - determine_ext, + parse_resolution, str_or_none, - url_or_none, - unified_strdate, - unified_timestamp, try_get, - url_basename, - remove_end + unified_timestamp, + url_or_none, + urljoin, ) class PuhuTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[a-z0-9-]+)-izle' + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle' IE_NAME = 'puhutv' - _TESTS = [ - { - # A Film - 'url': 'https://puhutv.com/sut-kardesler-izle', - 'md5': 'a347470371d56e1585d1b2c8dab01c96', - 'info_dict': { - 'id': 'sut-kardesler', - 'display_id': '5085', - 'ext': 'mp4', - 'title': 'Süt Kardeşler', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Arzu Film', - 'description': 'md5:405fd024df916ca16731114eb18e511a', - 'uploader_id': '43', - 'upload_date': '20160729', - 'timestamp': int, - }, + _TESTS = [{ + # film + 'url': 'https://puhutv.com/sut-kardesler-izle', + 'md5': 'fbd8f2d8e7681f8bcd51b592475a6ae7', + 'info_dict': { + 'id': '5085', + 'display_id': 'sut-kardesler', + 'ext': 'mp4', + 'title': 'Süt Kardeşler', + 'description': 'md5:405fd024df916ca16731114eb18e511a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4832.44, + 'creator': 'Arzu Film', + 'timestamp': 1469778212, + 'upload_date': '20160729', + 'release_year': 1976, + 'view_count': int, + 'tags': ['Aile', 'Komedi', 'Klasikler'], }, - { - # An Episode and geo restricted - 'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle', - 'only_matching': True, - }, - { - # Has subtitle - 'url': 'https://puhutv.com/dip-1-bolum-izle', - 'only_matching': True, - } - ] + }, { + # episode, geo restricted, bypassable with --geo-verification-proxy + 'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle', + 'only_matching': True, + }, { + # 4k, with subtitles + 'url': 'https://puhutv.com/dip-1-bolum-izle', + 'only_matching': True, + }] _SUBTITLE_LANGS = { 'English': 'en', 'Deutsch': 'de', @@ -56,47 +57,103 @@ class PuhuTVIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json( - 'https://puhutv.com/api/slug/%s-izle' % video_id, video_id)['data'] + display_id = self._match_id(url) - display_id = compat_str(info['id']) - title = info['title']['name'] + info = self._download_json( + urljoin(url, '/api/slug/%s-izle' % display_id), + display_id)['data'] + + video_id = compat_str(info['id']) + title = info.get('name') or info['title']['name'] if info.get('display_name'): title = '%s %s' % (title, info.get('display_name')) - description = try_get(info, lambda x: x['title']['description'], compat_str) or info.get('description') + try: + videos = self._download_json( + 'https://puhutv.com/api/assets/%s/videos' % video_id, + display_id, 'Downloading video JSON', + headers=self.geo_verification_headers()) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_geo_restricted() + raise + + formats = [] + for video in videos['data']['videos']: + media_url = url_or_none(video.get('url')) + if not media_url: + continue + playlist = video.get('is_playlist') + if video.get('stream_type') == 'hls' 
and playlist is True: + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = int_or_none(video.get('quality')) + f = { + 'url': media_url, + 'ext': 'mp4', + 'height': quality + } + video_format = video.get('video_format') + if video_format == 'hls' and playlist is False: + format_id = 'hls' + f['protocol'] = 'm3u8_native' + elif video_format == 'mp4': + format_id = 'http' + + else: + continue + if quality: + format_id += '-%sp' % quality + f['format_id'] = format_id + formats.append(f) + self._sort_formats(formats) + + description = try_get( + info, lambda x: x['title']['description'], + compat_str) or info.get('description') timestamp = unified_timestamp(info.get('created_at')) - upload_date = unified_strdate(info.get('created_at')) - uploader = try_get(info, lambda x: x['title']['producer']['name'], compat_str) - uploader_id = str_or_none(try_get(info, lambda x: x['title']['producer']['id'])) - view_count = int_or_none(try_get(info, lambda x: x['content']['watch_count'])) - duration = float_or_none(try_get(info, lambda x: x['content']['duration_in_ms']), scale=1000) - thumbnail = try_get(info, lambda x: x['content']['images']['wide']['main'], compat_str) - release_year = int_or_none(try_get(info, lambda x: x['title']['released_at'])) - webpage_url = info.get('web_url') + creator = try_get( + info, lambda x: x['title']['producer']['name'], compat_str) + + duration = float_or_none( + try_get(info, lambda x: x['content']['duration_in_ms'], int), + scale=1000) + view_count = try_get(info, lambda x: x['content']['watch_count'], int) + + images = try_get( + info, lambda x: x['content']['images']['wide'], dict) or {} + thumbnails = [] + for image_id, image_url in images.items(): + if not isinstance(image_url, compat_str): + continue + if not image_url.startswith(('http', '//')): + image_url = 'https://%s' % image_url + t = parse_resolution(image_id) + t.update({ + 'id': image_id, + 'url': image_url + }) + thumbnails.append(t) + + release_year = try_get(info, lambda x: x['title']['released_at'], int) season_number = int_or_none(info.get('season_number')) - season_id = int_or_none(info.get('season_id')) + season_id = str_or_none(info.get('season_id')) episode_number = int_or_none(info.get('episode_number')) tags = [] - for tag in try_get(info, lambda x: x['title']['genres'], list) or []: - if isinstance(tag.get('name'), compat_str): - tags.append(tag.get('name')) - - thumbnails = [] - thumbs_dict = try_get(info, lambda x: x['content']['images']['wide'], dict) or {} - for id, url in thumbs_dict.items(): - if not url_or_none(url): + for genre in try_get(info, lambda x: x['title']['genres'], list) or []: + if not isinstance(genre, dict): continue - thumbnails.append({ - 'url': 'https://%s' % url, - 'id': id - }) + genre_name = genre.get('name') + if genre_name and isinstance(genre_name, compat_str): + tags.append(genre_name) subtitles = {} - for subtitle in try_get(info, lambda x: x['content']['subtitles'], list) or []: + for subtitle in try_get( + info, lambda x: x['content']['subtitles'], list) or []: if not isinstance(subtitle, dict): continue lang = subtitle.get('language') @@ -107,30 +164,6 @@ class PuhuTVIE(InfoExtractor): 'url': sub_url }] - # Some of videos are geo restricted upon request copyright owner and returns 403 - req_formats = self._download_json( - 'https://puhutv.com/api/assets/%s/videos' % display_id, - video_id, 'Downloading video JSON') - - formats = [] - for format in 
req_formats['data']['videos']: - media_url = url_or_none(format.get('url')) - if not media_url: - continue - ext = format.get('video_format') or determine_ext(media_url) - quality = format.get('quality') - if format.get('stream_type') == 'hls' and format.get('is_playlist') is True: - m3u8_id = remove_end(url_basename(media_url), '.m3u8') - formats.append(self._m3u8_meta_format(media_url, ext, m3u8_id=m3u8_id)) - elif ext == 'mp4' and format.get('is_playlist', False) is False: - formats.append({ - 'url': media_url, - 'format_id': 'http-%s' % quality, - 'ext': ext, - 'height': quality - }) - self._sort_formats(formats) - return { 'id': video_id, 'display_id': display_id, @@ -140,93 +173,75 @@ class PuhuTVIE(InfoExtractor): 'season_number': season_number, 'episode_number': episode_number, 'release_year': release_year, - 'upload_date': upload_date, 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'creator': creator, 'view_count': view_count, 'duration': duration, 'tags': tags, 'subtitles': subtitles, - 'webpage_url': webpage_url, - 'thumbnail': thumbnail, 'thumbnails': thumbnails, 'formats': formats } class PuhuTVSerieIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[a-z0-9-]+)-detay' + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay' IE_NAME = 'puhutv:serie' - _TESTS = [ - { - 'url': 'https://puhutv.com/deniz-yildizi-detay', - 'info_dict': { - 'title': 'Deniz Yıldızı', - 'id': 'deniz-yildizi', - 'uploader': 'Focus Film', - 'uploader_id': 61, - }, - 'playlist_mincount': 234, + _TESTS = [{ + 'url': 'https://puhutv.com/deniz-yildizi-detay', + 'info_dict': { + 'title': 'Deniz Yıldızı', + 'id': 'deniz-yildizi', }, - { - # a film detail page which is using same url with serie page - 'url': 'https://puhutv.com/kaybedenler-kulubu-detay', - 'info_dict': { - 'title': 'Kaybedenler Kulübü', - 'id': 'kaybedenler-kulubu', - 'uploader': 'Tolga Örnek, Murat Dörtbudak, Neslihan Dörtbudak, Kemal Kaplanoğlu', - 'uploader_id': 248, - }, - 'playlist_mincount': 1, - }, - ] + 'playlist_mincount': 205, + }, { + # a film detail page which is using same url with serie page + 'url': 'https://puhutv.com/kaybedenler-kulubu-detay', + 'only_matching': True, + }] - def _extract_entries(self, playlist_id, seasons): + def _extract_entries(self, seasons): for season in seasons: - season_id = season['id'] - season_number = season.get('position') - pagenum = 1 + season_id = season.get('id') + if not season_id: + continue + page = 1 has_more = True while has_more is True: - season_info = self._download_json( + season = self._download_json( 'https://galadriel.puhutv.com/seasons/%s' % season_id, - playlist_id, 'Downloading season %s page %s' % (season_number, pagenum), query={ - 'page': pagenum, + season_id, 'Downloading page %s' % page, query={ + 'page': page, 'per': 40, }) - for episode in season_info.get('episodes'): - video_id = episode['slugPath'].replace('-izle', '') - yield self.url_result( - 'https://puhutv.com/%s-izle' % video_id, - PuhuTVIE.ie_key(), video_id) - pagenum = pagenum + 1 - has_more = season_info.get('hasMore', False) + episodes = season.get('episodes') + if isinstance(episodes, list): + for ep in episodes: + slug_path = str_or_none(ep.get('slugPath')) + if not slug_path: + continue + video_id = str_or_none(int_or_none(ep.get('id'))) + yield self.url_result( + 'https://puhutv.com/%s' % slug_path, + ie=PuhuTVIE.ie_key(), video_id=video_id, + video_title=ep.get('name') or ep.get('eventLabel')) + page += 1 + has_more = 
season.get('hasMore') def _real_extract(self, url): playlist_id = self._match_id(url) info = self._download_json( - 'https://puhutv.com/api/slug/%s-detay' % playlist_id, playlist_id)['data'] + urljoin(url, '/api/slug/%s-detay' % playlist_id), + playlist_id)['data'] - title = info.get('name') - uploader = try_get(info, lambda x: x['producer']['name'], compat_str) - uploader_id = try_get(info, lambda x: x['producer']['id']) seasons = info.get('seasons') if seasons: - entries = self._extract_entries(playlist_id, seasons) - else: - # For films, these are using same url with series - video_id = info['assets'][0]['slug'] - return self.url_result( - 'https://puhutv.com/%s-izle' % video_id, - PuhuTVIE.ie_key(), video_id) + return self.playlist_result( + self._extract_entries(seasons), playlist_id, info.get('name')) - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': title, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'entries': entries, - } + # For films, these are using same url with series + video_id = info.get('slug') or info['assets'][0]['slug'] + return self.url_result( + 'https://puhutv.com/%s-izle' % video_id, + PuhuTVIE.ie_key(), video_id) From a702056fbe0e4dc615ebc73d6891c505cd7d818f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:26:12 +0700 Subject: [PATCH 087/111] Credit @bastiandg for #16189 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index eaf96d79d..c9f7189ac 100644 --- a/AUTHORS +++ b/AUTHORS @@ -239,3 +239,4 @@ Martin Weinelt Surya Oktafendri TingPing Alexandre Macabies +Bastian de Groot From 7930f914949cbf28aa522463a22f0eea396875bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:27:28 +0700 Subject: [PATCH 088/111] Credit @haasn for #16326 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index c9f7189ac..9b9fff3c3 100644 --- a/AUTHORS +++ b/AUTHORS @@ -240,3 +240,4 @@ Surya Oktafendri TingPing Alexandre Macabies Bastian de Groot +Niklas Haas From d94fb1225edc546f81f4b01b2eebcc9e4cd89313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:29:25 +0700 Subject: [PATCH 089/111] Credit @dnet for #16174 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 9b9fff3c3..0b23f4066 100644 --- a/AUTHORS +++ b/AUTHORS @@ -241,3 +241,4 @@ TingPing Alexandre Macabies Bastian de Groot Niklas Haas +András Veres-Szentkirályi From 694079dff754e72e8b3b6c1395f1e9b8e9a66db7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:31:46 +0700 Subject: [PATCH 090/111] Credit @mrfade for #16269 and #16271 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 0b23f4066..af8accb51 100644 --- a/AUTHORS +++ b/AUTHORS @@ -242,3 +242,4 @@ Alexandre Macabies Bastian de Groot Niklas Haas András Veres-Szentkirályi +Enes Solak From a789d1cc90055aeefa4a2dafb4a81fcf43b65bb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:34:34 +0700 Subject: [PATCH 091/111] Credit @nathanrossi for #16554 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index af8accb51..c2f48668b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -243,3 +243,4 @@ Bastian de Groot Niklas Haas András Veres-Szentkirályi Enes Solak +Nathan Rossi From 234a85858cd50d6c88848c2e0ceffeabe26306e4 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:35:38 +0700 Subject: [PATCH 092/111] Credit @tmsbrg for #15462 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index c2f48668b..58b695784 100644 --- a/AUTHORS +++ b/AUTHORS @@ -244,3 +244,4 @@ Niklas Haas András Veres-Szentkirályi Enes Solak Nathan Rossi +Thomas van der Berg From d4e7065111b1c18bc47795b102a4e5c6757e9bad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:36:58 +0700 Subject: [PATCH 093/111] Credit @Kerruba for #16328 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 58b695784..b507cb8df 100644 --- a/AUTHORS +++ b/AUTHORS @@ -245,3 +245,4 @@ András Veres-Szentkirályi Enes Solak Nathan Rossi Thomas van der Berg +Luca Cherubin From 631f93ee2d5dfe5a90da38d293159670ada4d95e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 23 Jul 2018 06:20:00 +0100 Subject: [PATCH 094/111] [facebook] fix tahoe request for authenticated users(closes #16655) --- youtube_dl/extractor/facebook.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f78479b92..97cfe0fc3 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -355,7 +355,6 @@ class FacebookIE(InfoExtractor): tahoe_data = self._download_webpage( self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, data=urlencode_postdata({ - '__user': 0, '__a': 1, '__pc': self._search_regex( r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, @@ -363,6 +362,9 @@ class FacebookIE(InfoExtractor): '__rev': self._search_regex( r'client_revision["\']\s*:\s*(\d+),', webpage, 'client revision', default='3944515'), + 'fb_dtsg': self._search_regex( + r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"', + webpage, 'dtsg token', default=''), }), headers={ 'Content-Type': 'application/x-www-form-urlencoded', From b5dec62ca688f35feac1f56415355e9a8e850edb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Jul 2018 23:07:12 +0700 Subject: [PATCH 095/111] [streamcloud] Fix extraction (closes #17054) --- youtube_dl/extractor/streamcloud.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py index 6a6bb90c4..4a410611d 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/youtube_dl/extractor/streamcloud.py @@ -72,4 +72,7 @@ class StreamcloudIE(InfoExtractor): 'title': title, 'url': video_url, 'thumbnail': thumbnail, + 'http_headers': { + 'Referer': url, + }, } From ad1bc71a8a448583734b313bc77a2097200ad97b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 26 Jul 2018 07:24:46 +0100 Subject: [PATCH 096/111] [vk] fix extraction for inline only videos(fixes #16923) --- youtube_dl/extractor/vk.py | 40 +++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 48b5987c2..ef8b9bcb7 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -17,6 +17,7 @@ from ..utils import ( int_or_none, orderedSet, remove_start, + str_or_none, str_to_int, unescapeHTML, unified_timestamp, @@ -106,10 +107,10 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': 'ProtivoGunz - Хуёвая песня', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', + 
'uploader_id': '-77521', 'duration': 195, - 'timestamp': 1329060660, + 'timestamp': 1329049880, 'upload_date': '20120212', - 'view_count': int, }, }, { @@ -118,12 +119,12 @@ class VKIE(VKBaseIE): 'info_dict': { 'id': '165548505', 'ext': 'mp4', - 'uploader': 'Tom Cruise', 'title': 'No name', + 'uploader': 'Tom Cruise', + 'uploader_id': '205387401', 'duration': 9, - 'timestamp': 1374374880, - 'upload_date': '20130721', - 'view_count': int, + 'timestamp': 1374364108, + 'upload_date': '20130720', } }, { @@ -207,10 +208,10 @@ class VKIE(VKBaseIE): 'id': 'V3K4mi0SYkc', 'ext': 'webm', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", - 'description': 'md5:d9903938abdc74c738af77f527ca0596', - 'duration': 178, + 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', + 'duration': 179, 'upload_date': '20130116', - 'uploader': "Children's Joy Foundation", + 'uploader': "Children's Joy Foundation Inc.", 'uploader_id': 'thecjf', 'view_count': int, }, @@ -222,6 +223,7 @@ class VKIE(VKBaseIE): 'id': 'k3lz2cmXyRuJQSjGHUv', 'ext': 'mp4', 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', + # TODO: fix test by fixing dailymotion description extraction 'description': 'md5:c651358f03c56f1150b555c26d90a0fd', 'uploader': 'AniLibria.Tv', 'upload_date': '20160914', @@ -241,9 +243,12 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': 'S-Dance, репетиции к The way show', 'uploader': 'THE WAY SHOW | 17 апреля', - 'timestamp': 1454870100, + 'uploader_id': '-110305615', + 'timestamp': 1454859345, 'upload_date': '20160207', - 'view_count': int, + }, + 'params': { + 'skip_download': True, }, }, { @@ -296,7 +301,7 @@ class VKIE(VKBaseIE): video_id = mobj.group('videoid') if video_id: - info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id + info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id # Some videos (removed?) 
can only be downloaded with list id specified list_id = mobj.group('list_id') if list_id: @@ -346,6 +351,9 @@ class VKIE(VKBaseIE): r'<!>This video is no longer available, because its author has been blocked.': 'Video %s is no longer available, because its author has been blocked.', + + r'<!>This video is no longer available, because it has been deleted.': + 'Video %s is no longer available, because it has been deleted.', } for error_re, error_msg in ERRORS.items(): @@ -394,7 +402,8 @@ class VKIE(VKBaseIE): if not data: data = self._parse_json( self._search_regex( - r'<!json>\s*({.+?})\s*<!>', info_page, 'json', default='{}'), + [r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'], + info_page, 'json', default='{}'), video_id) if data: data = data['player']['params'][0] @@ -416,7 +425,7 @@ class VKIE(VKBaseIE): timestamp = unified_timestamp(self._html_search_regex( r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page, - 'upload date', fatal=False)) + 'upload date', default=None)) or int_or_none(data.get('date')) view_count = str_to_int(self._search_regex( r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', @@ -454,9 +463,12 @@ class VKIE(VKBaseIE): 'title': title, 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), + 'uploader_id': str_or_none(data.get('author_id')), 'duration': data.get('duration'), 'timestamp': timestamp, 'view_count': view_count, + 'like_count': int_or_none(data.get('liked')), + 'dislike_count': int_or_none(data.get('nolikes')), 'is_live': is_live, } From 0c7b4f49eb07bb68918da3dd7ff277565273033f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 26 Jul 2018 08:11:06 +0100 Subject: [PATCH 097/111] [rai] return non http relinker URL intact(closes #17055) --- youtube_dl/extractor/rai.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index d22311031..f916b2619 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -32,6 +32,9 @@ class RaiBaseIE(InfoExtractor): _GEO_BYPASS = False def _extract_relinker_info(self, relinker_url, video_id): + if not re.match(r'https?://', relinker_url): + return {'formats': [{'url': relinker_url}]} + formats = [] geoprotection = None is_live = None @@ -369,6 +372,10 @@ class RaiIE(RaiBaseIE): 'params': { 'skip_download': True, }, + }, { + # Direct MMS URL + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', + 'only_matching': True, }] def _extract_from_content_id(self, content_id, url): From 722f1a0f8f82617b3abf646b2d0df2c624e98912 Mon Sep 17 00:00:00 2001 From: Sidney de Koning <sidney.dekoning@gmail.com> Date: Fri, 27 Jul 2018 19:18:41 +0200 Subject: [PATCH 098/111] [README.md] Actualize Firefox cookie export add-on Previous one does not work with newer Firefox versions --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6d49d6a4f..dd068a462 100644 --- a/README.md +++ b/README.md @@ -870,7 +870,7 @@ Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the op Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. -In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox). 
+In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox). Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. From 8e37a7e4cce7555c12b8d9f7a1d331476aba357c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 28 Jul 2018 06:52:36 +0100 Subject: [PATCH 099/111] [mitele] reduce number of requests and update tests --- youtube_dl/extractor/mitele.py | 109 +++------------------------------ 1 file changed, 10 insertions(+), 99 deletions(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 42759eae8..40f214a87 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,84 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import uuid - from .common import InfoExtractor -from .ooyala import OoyalaIE -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( int_or_none, - extract_attributes, - determine_ext, smuggle_url, parse_duration, ) -class MiTeleBaseIE(InfoExtractor): - def _get_player_info(self, url, webpage): - player_data = extract_attributes(self._search_regex( - r'(?s)(<ms-video-player.+?</ms-video-player>)', - webpage, 'ms video player')) - video_id = player_data['data-media-id'] - if player_data.get('data-cms-id') == 'ooyala': - return self.url_result( - 'ooyala:%s' % video_id, ie=OoyalaIE.ie_key(), video_id=video_id) - config_url = compat_urlparse.urljoin(url, player_data['data-config']) - config = self._download_json( - config_url, video_id, 'Downloading config JSON') - mmc_url = config['services']['mmc'] - - duration = None - formats = [] - for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')): - mmc = self._download_json( - m_url, video_id, 'Downloading mmc JSON') - if not duration: - duration = int_or_none(mmc.get('duration')) - for location in mmc['locations']: - gat = self._proto_relative_url(location.get('gat'), 'http:') - gcp = location.get('gcp') - ogn = location.get('ogn') - if None in (gat, gcp, ogn): - continue - token_data = { - 'gcp': gcp, - 'ogn': ogn, - 'sta': 0, - } - media = self._download_json( - gat, video_id, data=json.dumps(token_data).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }) - stream = media.get('stream') or media.get('file') - if not stream: - continue - ext = determine_ext(stream) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'thumbnail': player_data.get('data-poster') or 
config.get('poster', {}).get('imageUrl'), - 'duration': duration, - } - - class MiTeleIE(InfoExtractor): IE_DESC = 'mitele.es' _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player' @@ -86,7 +16,7 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player', 'info_dict': { - 'id': '57b0dfb9c715da65618b4afa', + 'id': 'FhYW1iNTE6J6H7NkQRIEzfne6t2quqPg', 'ext': 'mp4', 'title': 'Tor, la web invisible', 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', @@ -104,7 +34,7 @@ class MiTeleIE(InfoExtractor): # no explicit title 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player', 'info_dict': { - 'id': '57b0de3dc915da14058b4876', + 'id': 'oyNG1iNTE6TAPP-JmCjbwfwJqqMMX3Vq', 'ext': 'mp4', 'title': 'Cuarto Milenio Temporada 6 Programa 226', 'description': 'md5:5ff132013f0cd968ffbf1f5f3538a65f', @@ -128,40 +58,21 @@ class MiTeleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - gigya_url = self._search_regex( - r'<gigya-api>[^>]*</gigya-api>[^>]*<script\s+src="([^"]*)">[^>]*</script>', - webpage, 'gigya', default=None) - gigya_sc = self._download_webpage( - compat_urlparse.urljoin('http://www.mitele.es/', gigya_url), - video_id, 'Downloading gigya script') - - # Get a appKey/uuid for getting the session key - appKey = self._search_regex( - r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)', - gigya_sc, 'appKey') - - session_json = self._download_json( - 'https://appgrid-api.cloud.accedo.tv/session', - video_id, 'Downloading session keys', query={ - 'appKey': appKey, - 'uuid': compat_str(uuid.uuid4()), - }) paths = self._download_json( - 'https://appgrid-api.cloud.accedo.tv/metadata/general_configuration,%20web_configuration', - video_id, 'Downloading paths JSON', - query={'sessionKey': compat_str(session_json['sessionKey'])}) + 'https://www.mitele.es/amd/agp/web/metadata/general_configuration', + video_id, 'Downloading paths JSON') ooyala_s = paths['general_configuration']['api_configuration']['ooyala_search'] + base_url = ooyala_s.get('base_url', 'cdn-search-mediaset.carbyne.ps.ooyala.com') + full_path = ooyala_s.get('full_path', '/search/v1/full/providers/') source = self._download_json( - 'http://%s%s%s/docs/%s' % ( - ooyala_s['base_url'], ooyala_s['full_path'], - ooyala_s['provider_id'], video_id), + '%s://%s%s%s/docs/%s' % ( + ooyala_s.get('protocol', 'https'), base_url, full_path, + ooyala_s.get('provider_id', '104951'), video_id), video_id, 'Downloading data JSON', query={ 'include_titles': 'Series,Season', - 'product_name': 'test', + 'product_name': ooyala_s.get('product_name', 'test'), 'format': 'full', })['hits']['hits'][0]['_source'] From a098c99f0d0deb4ad0d5c9b67496582d89970368 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 28 Jul 2018 06:55:18 +0100 Subject: [PATCH 100/111] [telecinco] fix extraction(closes #17080) --- youtube_dl/extractor/telecinco.py | 122 ++++++++++++++++++++++++++---- 1 file changed, 106 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index fdcc7d573..d37e1b055 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -1,26 +1,43 @@ # coding: utf-8 from __future__ import unicode_literals -from .mitele import MiTeleBaseIE +import json +import re + +from .common import InfoExtractor +from .ooyala import 
OoyalaIE +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + str_or_none, + urljoin, +) -class TelecincoIE(MiTeleBaseIE): +class TelecincoIE(InfoExtractor): IE_DESC = 'telecinco.es, cuatro.com and mediaset.es' _VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', - 'md5': '8d7b2d5f699ee2709d992a63d5cd1712', 'info_dict': { - 'id': 'JEA5ijCnF6p5W08A1rNKn7', - 'ext': 'mp4', + 'id': '1876350223', 'title': 'Bacalao con kokotxas al pil-pil', 'description': 'md5:1382dacd32dd4592d478cbdca458e5bb', - 'duration': 662, }, + 'playlist': [{ + 'md5': 'adb28c37238b675dad0f042292f209a7', + 'info_dict': { + 'id': 'JEA5ijCnF6p5W08A1rNKn7', + 'ext': 'mp4', + 'title': 'Con Martín Berasategui, hacer un bacalao al pil-pil es fácil y divertido', + 'duration': 662, + }, + }] }, { 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', - 'md5': '284393e5387b3b947b77c613ef04749a', + 'md5': '9468140ebc300fbb8b9d65dc6e5c4b43', 'info_dict': { 'id': 'jn24Od1zGLG4XUZcnUnZB6', 'ext': 'mp4', @@ -30,7 +47,7 @@ class TelecincoIE(MiTeleBaseIE): }, }, { 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', - 'md5': '749afab6ea5a136a8806855166ae46a2', + 'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6', 'info_dict': { 'id': 'aywerkD2Sv1vGNqq9b85Q2', 'ext': 'mp4', @@ -50,17 +67,90 @@ class TelecincoIE(MiTeleBaseIE): 'only_matching': True, }] + def _parse_content(self, content, url): + video_id = content['dataMediaId'] + if content.get('dataCmsId') == 'ooyala': + return self.url_result( + 'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id) + config_url = urljoin(url, content['dataConfig']) + config = self._download_json( + config_url, video_id, 'Downloading config JSON') + title = config['info']['title'] + + def mmc_url(mmc_type): + return re.sub( + r'/(?:flash|html5)\.json', '/%s.json' % mmc_type, + config['services']['mmc']) + + duration = None + formats = [] + for mmc_type in ('flash', 'html5'): + mmc = self._download_json( + mmc_url(mmc_type), video_id, + 'Downloading %s mmc JSON' % mmc_type, fatal=False) + if not mmc: + continue + if not duration: + duration = int_or_none(mmc.get('duration')) + for location in mmc['locations']: + gat = self._proto_relative_url(location.get('gat'), 'http:') + gcp = location.get('gcp') + ogn = location.get('ogn') + if None in (gat, gcp, ogn): + continue + token_data = { + 'gcp': gcp, + 'ogn': ogn, + 'sta': 0, + } + media = self._download_json( + gat, video_id, data=json.dumps(token_data).encode('utf-8'), + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }, fatal=False) or {} + stream = media.get('stream') or media.get('file') + if not stream: + continue + ext = determine_ext(stream) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'), + 'duration': duration, + } + def _real_extract(self, url): display_id = self._match_id(url) webpage = 
self._download_webpage(url, display_id) - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage, 'title') - info = self._get_player_info(url, webpage) + article = self._parse_json(self._search_regex( + r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})', + webpage, 'article'), display_id)['article'] + title = article.get('title') + description = clean_html(article.get('leadParagraph')) + if article.get('editorialType') != 'VID': + entries = [] + for p in article.get('body', []): + content = p.get('content') + if p.get('type') != 'video' or not content: + continue + entries.append(self._parse_content(content, url)) + return self.playlist_result( + entries, str_or_none(article.get('id')), title, description) + content = article['opening']['content'] + info = self._parse_content(content, url) info.update({ - 'display_id': display_id, - 'title': title, - 'description': self._html_search_meta( - ['og:description', 'twitter:description'], - webpage, 'title', fatal=False), + 'description': description, }) return info From 9a984265b90eb0e8a7c26a1edf479fbfcebce0a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Jul 2018 21:26:23 +0700 Subject: [PATCH 101/111] [ted] Fix extraction for videos without nativeDownloads (closes #16756, closes #17085) --- youtube_dl/extractor/ted.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 06a27fd04..dc9c5ce8e 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -107,6 +107,19 @@ class TEDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # no nativeDownloads + 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth', + 'info_dict': { + 'id': '1792', + 'ext': 'mp4', + 'title': 'The orchestra in my mouth', + 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a', + 'uploader': 'Tom Thum', + }, + 'params': { + 'skip_download': True, + }, }] _NATIVE_FORMATS = { @@ -180,8 +193,10 @@ class TEDIE(InfoExtractor): } native_downloads = try_get( - talk_info, lambda x: x['downloads']['nativeDownloads'], - dict) or talk_info['nativeDownloads'] + talk_info, + (lambda x: x['downloads']['nativeDownloads'], + lambda x: x['nativeDownloads']), + dict) or {} formats = [{ 'url': format_url, From cd3a3ff93bd5d6866d3822cb438b0e172ffe4e39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Jul 2018 22:09:53 +0700 Subject: [PATCH 102/111] [ted] Improve extraction and update tests --- youtube_dl/extractor/ted.py | 111 ++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index dc9c5ce8e..212ac80ab 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -7,8 +7,10 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + float_or_none, int_or_none, try_get, + url_or_none, ) @@ -30,7 +32,7 @@ class TEDIE(InfoExtractor): ''' _TESTS = [{ 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', - 'md5': '0de43ac406aa3e4ea74b66c9c7789b13', + 'md5': 'b0ce2b05ca215042124fbc9e3886493a', 'info_dict': { 'id': '102', 'ext': 'mp4', @@ -42,24 +44,30 @@ class TEDIE(InfoExtractor): 'uploader': 'Dan Dennett', 'width': 853, 'duration': 1308, - } - }, { - 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms', - 'md5': 
'b899ac15e345fb39534d913f7606082b', - 'info_dict': { - 'id': 'tSVI8ta_P4w', - 'ext': 'mp4', - 'title': 'Vishal Sikka: The beauty and power of algorithms', - 'thumbnail': r're:^https?://.+\.jpg', - 'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4', - 'upload_date': '20140122', - 'uploader_id': 'TEDInstitute', - 'uploader': 'TED Institute', + 'view_count': int, + 'comment_count': int, + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + # missing HTTP bitrates + 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms', + 'info_dict': { + 'id': '6069', + 'ext': 'mp4', + 'title': 'The beauty and power of algorithms', + 'thumbnail': r're:^https?://.+\.jpg', + 'description': 'md5:734e352710fb00d840ab87ae31aaf688', + 'uploader': 'Vishal Sikka', + }, + 'params': { + 'skip_download': True, }, - 'add_ie': ['Youtube'], }, { 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best', - 'md5': '71b3ab2f4233012dce09d515c9c39ce2', + 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0', 'info_dict': { 'id': '1972', 'ext': 'mp4', @@ -68,6 +76,9 @@ class TEDIE(InfoExtractor): 'description': 'md5:5174aed4d0f16021b704120360f72b92', 'duration': 1128, }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.ted.com/playlists/who_are_the_hackers', 'info_dict': { @@ -91,22 +102,6 @@ class TEDIE(InfoExtractor): 'params': { 'skip_download': True, }, - }, { - # YouTube video - 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond', - 'add_ie': ['Youtube'], - 'info_dict': { - 'id': 'aFBIPO-P7LM', - 'ext': 'mp4', - 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville', - 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1', - 'uploader': 'TEDx Talks', - 'uploader_id': 'TEDxTalks', - 'upload_date': '20111216', - }, - 'params': { - 'skip_download': True, - }, }, { # no nativeDownloads 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth', @@ -116,6 +111,9 @@ class TEDIE(InfoExtractor): 'title': 'The orchestra in my mouth', 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a', 'uploader': 'Tom Thum', + 'view_count': int, + 'comment_count': int, + 'tags': list, }, 'params': { 'skip_download': True, @@ -174,24 +172,11 @@ class TEDIE(InfoExtractor): info = self._extract_info(webpage) - talk_info = try_get( - info, lambda x: x['__INITIAL_DATA__']['talks'][0], - dict) or info['talks'][0] + data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info + talk_info = data['talks'][0] title = talk_info['title'].strip() - external = talk_info.get('external') - if external: - service = external['service'] - self.to_screen('Found video from %s' % service) - ext_url = None - if service.lower() == 'youtube': - ext_url = external.get('code') - return { - '_type': 'url', - 'url': ext_url or external['uri'], - } - native_downloads = try_get( talk_info, (lambda x: x['downloads']['nativeDownloads'], @@ -211,10 +196,24 @@ class TEDIE(InfoExtractor): player_talk = talk_info['player_talks'][0] + external = player_talk.get('external') + if isinstance(external, dict): + service = external.get('service') + if isinstance(service, compat_str): + ext_url = None + if service.lower() == 'youtube': + ext_url = external.get('code') + return { + '_type': 'url', + 'url': ext_url or external['uri'], + } + resources_ = player_talk.get('resources') or talk_info.get('resources') http_url = None for format_id, resources in resources_.items(): + if not isinstance(resources, dict): + continue if format_id == 
'h264': for resource in resources: h264_url = resource.get('file') @@ -243,8 +242,12 @@ class TEDIE(InfoExtractor): 'tbr': int_or_none(resource.get('bitrate')), }) elif format_id == 'hls': + stream_url = url_or_none(resources.get('stream')) + if not stream_url: + continue formats.extend(self._extract_m3u8_formats( - resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False)) + stream_url, video_name, 'mp4', m3u8_id=format_id, + fatal=False)) m3u8_formats = list(filter( lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', @@ -254,9 +257,13 @@ class TEDIE(InfoExtractor): bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) if not bitrate: continue + bitrate_url = re.sub(r'\d+k', bitrate, http_url) + if not self._is_valid_url( + bitrate_url, video_name, '%s bitrate' % bitrate): + continue f = m3u8_format.copy() f.update({ - 'url': re.sub(r'\d+k', bitrate, http_url), + 'url': bitrate_url, 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) @@ -282,7 +289,11 @@ class TEDIE(InfoExtractor): 'description': self._og_search_description(webpage), 'subtitles': self._get_subtitles(video_id, talk_info), 'formats': formats, - 'duration': talk_info.get('duration'), + 'duration': float_or_none(talk_info.get('duration')), + 'view_count': int_or_none(data.get('viewed_count')), + 'comment_count': int_or_none( + try_get(data, lambda x: x['comments']['count'])), + 'tags': try_get(talk_info, lambda x: x['tags'], list), } def _get_subtitles(self, video_id, talk_info): From ec240a43696478e43abb15e7c91f067b2bd5fe08 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 28 Jul 2018 20:29:56 +0100 Subject: [PATCH 103/111] [dailymotion:playlist] fix extraction(closes #16894) --- youtube_dl/extractor/dailymotion.py | 126 ++++++++++++++++++---------- 1 file changed, 84 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 8f5f57b98..040f0bd02 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import base64 +import functools import hashlib import itertools import json @@ -16,11 +17,13 @@ from ..utils import ( error_to_compat_str, ExtractorError, int_or_none, + mimetype2ext, + OnDemandPagedList, parse_iso8601, sanitized_Request, str_to_int, unescapeHTML, - mimetype2ext, + urlencode_postdata, ) @@ -343,17 +346,93 @@ class DailymotionIE(DailymotionBaseInfoExtractor): class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)' - _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' - _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)' _TESTS = [{ 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', 'info_dict': { 'title': 'SPORT', - 'id': 'xv4bw_nqtv_sport', + 'id': 'xv4bw', }, 'playlist_mincount': 20, }] + _PAGE_SIZE = 100 + + def _fetch_page(self, playlist_id, authorizaion, page): + page += 1 + videos = self._download_json( + 'https://graphql.api.dailymotion.com', + playlist_id, 'Downloading page %d' % page, + data=json.dumps({ + 'query': '''{ + collection(xid: "%s") { + videos(first: %d, page: %d) { + pageInfo { + hasNextPage + nextPage + } + 
edges { + node { + xid + url + } + } + } + } +}''' % (playlist_id, self._PAGE_SIZE, page) + }).encode(), headers={ + 'Authorization': authorizaion, + 'Origin': 'https://www.dailymotion.com', + })['data']['collection']['videos'] + for edge in videos['edges']: + node = edge['node'] + yield self.url_result( + node['url'], DailymotionIE.ie_key(), node['xid']) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + api = self._parse_json(self._search_regex( + r'__PLAYER_CONFIG__\s*=\s*({.+?});', + webpage, 'player config'), playlist_id)['context']['api'] + auth = self._download_json( + api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'), + playlist_id, data=urlencode_postdata({ + 'client_id': api.get('client_id', 'f1a362d288c1b98099c7'), + 'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'), + 'grant_type': 'client_credentials', + })) + authorizaion = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token']) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, playlist_id, authorizaion), self._PAGE_SIZE) + return self.playlist_result( + entries, playlist_id, + self._og_search_title(webpage)) + + +class DailymotionUserIE(DailymotionBaseInfoExtractor): + IE_NAME = 'dailymotion:user' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' + _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' + _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' + _TESTS = [{ + 'url': 'https://www.dailymotion.com/user/nqtv', + 'info_dict': { + 'id': 'nqtv', + 'title': 'Rémi Gaillard', + }, + 'playlist_mincount': 100, + }, { + 'url': 'http://www.dailymotion.com/user/UnderProject', + 'info_dict': { + 'id': 'UnderProject', + 'title': 'UnderProject', + }, + 'playlist_mincount': 1800, + 'expected_warnings': [ + 'Stopped at duplicated page', + ], + 'skip': 'Takes too long time', + }] def _extract_entries(self, id): video_ids = set() @@ -379,43 +458,6 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: break - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - webpage = self._download_webpage(url, playlist_id) - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': self._og_search_title(webpage), - 'entries': self._extract_entries(playlist_id), - } - - -class DailymotionUserIE(DailymotionPlaylistIE): - IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' - _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' - _TESTS = [{ - 'url': 'https://www.dailymotion.com/user/nqtv', - 'info_dict': { - 'id': 'nqtv', - 'title': 'Rémi Gaillard', - }, - 'playlist_mincount': 100, - }, { - 'url': 'http://www.dailymotion.com/user/UnderProject', - 'info_dict': { - 'id': 'UnderProject', - 'title': 'UnderProject', - }, - 'playlist_mincount': 1800, - 'expected_warnings': [ - 'Stopped at duplicated page', - ], - 'skip': 'Takes too long time', - }] - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user = mobj.group('user') From 38e87f6c2ae2cdc04dd6f526213c83a0259db335 Mon Sep 17 00:00:00 2001 From: Huyuumi <zx.you.funy@gmail.com> Date: Sun, 29 Jul 2018 07:52:42 +0900 Subject: [PATCH 104/111] [utils] Remove return 
from __init__ --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b84436ed6..29cafd8f0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3569,7 +3569,7 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): setattr(self, '%s_open' % type, lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: meth(r, proxy, type)) - return compat_urllib_request.ProxyHandler.__init__(self, proxies) + compat_urllib_request.ProxyHandler.__init__(self, proxies) def proxy_open(self, req, proxy, type): req_proxy = req.headers.get('Ytdl-request-proxy') From 1a88fc5a69249a6c36ce5dbb6e5d251792ab6f39 Mon Sep 17 00:00:00 2001 From: bato3 <bato3@bandyci.org> Date: Sun, 29 Jul 2018 01:04:59 +0200 Subject: [PATCH 105/111] [ceskatelevize] Use https for API call (refs #16997) --- youtube_dl/extractor/ceskatelevize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 6bad90859..46380430f 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -108,7 +108,7 @@ class CeskaTelevizeIE(InfoExtractor): for user_agent in (None, USER_AGENTS['Safari']): req = sanitized_Request( - 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') From 4938c8d5730fd0db9eee977f94052c0058e407f0 Mon Sep 17 00:00:00 2001 From: Giuseppe Fabiano <gfabiano40@gmail.com> Date: Sun, 29 Jul 2018 01:24:10 +0200 Subject: [PATCH 106/111] [pornhub] Add support for subtitles (closes #16924) --- youtube_dl/extractor/pornhub.py | 34 +++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 97f988da4..ffc4405a8 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -18,6 +18,7 @@ from ..utils import ( orderedSet, remove_quotes, str_to_int, + url_or_none, ) @@ -68,6 +69,31 @@ class PornHubIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # subtitles + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', + 'info_dict': { + 'id': 'ph5af5fef7c2aa7', + 'ext': 'mp4', + 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor', + 'uploader': 'BFFs', + 'duration': 622, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + 'subtitles': { + 'en': [{ + "ext": 'srt' + }] + }, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, @@ -139,12 +165,19 @@ class PornHubIE(InfoExtractor): video_urls = [] video_urls_set = set() + subtitles = {} flashvars = self._parse_json( self._search_regex( r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), video_id) if flashvars: + subtitle_url = url_or_none(flashvars.get('closedCaptionsFile')) + if subtitle_url: + subtitles.setdefault('en', []).append({ + 'url': subtitle_url, + 'ext': 'srt', + }) thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) media_definitions = flashvars.get('mediaDefinitions') @@ -256,6 +289,7 @@ class PornHubIE(InfoExtractor): 'age_limit': 18, 'tags': tags, 'categories': 
categories, + 'subtitles': subtitles, } From b2286f8fb29a7be8eefe53e074df8ab1092a12d8 Mon Sep 17 00:00:00 2001 From: bato3 <bato3@bandyci.org> Date: Sun, 29 Jul 2018 01:56:52 +0200 Subject: [PATCH 107/111] [crunchyroll:playlist] Restrict _VALID_URL (closes #17069) --- youtube_dl/extractor/crunchyroll.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 311da515d..463f995c7 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -262,6 +262,9 @@ class CrunchyrollIE(CrunchyrollBaseIE): # Just test metadata extraction 'skip_download': True, }, + }, { + 'url': 'http://www.crunchyroll.com/media-723735', + 'only_matching': True, }] _FORMAT_IDS = { @@ -580,7 +583,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = 'crunchyroll:playlist' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', From 4eecef84f32869c25d56a4297a09b1b0b14a403e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Jul 2018 06:58:37 +0700 Subject: [PATCH 108/111] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index 94ecaae8e..f9dd0a89f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version <unreleased> + +Extractors +* [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076) ++ [pornhub] Add support for subtitles (#16924, #17088) +* [ceskatelevize] Use https for API call (#16997, #16999) +* [dailymotion:playlist] Fix extraction (#16894) +* [ted] Improve extraction +* [ted] Fix extraction for videos without nativeDownloads (#16756, #17085) +* [telecinco] Fix extraction (#17080) +* [mitele] Reduce number of requests +* [rai] Return non HTTP relinker URL intact (#17055) +* [vk] Fix extraction for inline only videos (#16923) +* [streamcloud] Fix extraction (#17054) +* [facebook] Fix tahoe player extraction with authentication (#16655) ++ [puhutv] Add support for puhutv.com (#12712, #16010, #16269) + + version 2018.07.21 Core From 548482841867a16d3f68e18f78091e59f768a880 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Jul 2018 07:02:18 +0700 Subject: [PATCH 109/111] release 2018.07.29 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 24827ba8f..cae5fd749 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.21*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.21** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.29*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.29** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.07.21 +[debug] youtube-dl version 2018.07.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index f9dd0a89f..dfa27d3be 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.07.29 Extractors * [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 6cbe81802..f464d89db 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -672,6 +672,8 @@ - **PrimeShareTV** - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital + - **puhutv** + - **puhutv:serie** - **Puls4** - **Pyvideo** - **qqmusic**: QQ音乐 diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9bf0ea30d..2048b69d2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.07.21' +__version__ = '2018.07.29' From 9d1b213845f35af4de40dd057754f8f285091bfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 30 Jul 2018 03:05:36 +0700 Subject: [PATCH 110/111] [viqeo] Add extractor (closes #17066) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 15 +++++ youtube_dl/extractor/viqeo.py | 99 ++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+) create mode 100644 youtube_dl/extractor/viqeo.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 29fab5b9a..c7a91a986 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1291,6 +1291,7 @@ from .viki import ( VikiIE, VikiChannelIE, ) +from .viqeo import ViqeoIE from .viu import ( ViuIE, ViuPlaylistIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e5a8ffbe8..43218c3a4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -113,6 +113,7 @@ from .peertube import PeerTubeIE from .indavideo import IndavideoEmbedIE from .apa import APAIE from .foxnews import FoxNewsIE +from .viqeo import ViqeoIE class GenericIE(InfoExtractor): @@ -2060,6 +2061,15 @@ class GenericIE(InfoExtractor): }, 'skip': 'TODO: fix nested playlists processing in tests', }, + { + # Viqeo embeds + 'url': 'https://viqeo.tv/', + 'info_dict': { + 'id': 'viqeo', + 'title': 'All-new video platform', + }, 
+ 'playlist_count': 6, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -3094,6 +3104,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( sharevideos_urls, video_id, video_title) + viqeo_urls = ViqeoIE._extract_urls(webpage) + if viqeo_urls: + return self.playlist_from_matches( + viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/youtube_dl/extractor/viqeo.py b/youtube_dl/extractor/viqeo.py new file mode 100644 index 000000000..be7dfa814 --- /dev/null +++ b/youtube_dl/extractor/viqeo.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + str_or_none, + url_or_none, +) + + +class ViqeoIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + viqeo:| + https?://cdn\.viqeo\.tv/embed/*\?.*?\bvid=| + https?://api\.viqeo\.tv/v\d+/data/startup?.*?\bvideo(?:%5B%5D|\[\])= + ) + (?P<id>[\da-f]+) + ''' + _TESTS = [{ + 'url': 'https://cdn.viqeo.tv/embed/?vid=cde96f09d25f39bee837', + 'md5': 'a169dd1a6426b350dca4296226f21e76', + 'info_dict': { + 'id': 'cde96f09d25f39bee837', + 'ext': 'mp4', + 'title': 'cde96f09d25f39bee837', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 76, + }, + }, { + 'url': 'viqeo:cde96f09d25f39bee837', + 'only_matching': True, + }, { + 'url': 'https://api.viqeo.tv/v1/data/startup?video%5B%5D=71bbec412ade45c3216c&profile=112', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://cdn.viqeo.tv/embed/?vid=%s' % video_id, video_id) + + data = self._parse_json( + self._search_regex( + r'SLOT_DATA\s*=\s*({.+?})\s*;', webpage, 'slot data'), + video_id) + + formats = [] + thumbnails = [] + for media_file in data['mediaFiles']: + if not isinstance(media_file, dict): + continue + media_url = url_or_none(media_file.get('url')) + if not media_url or not media_url.startswith(('http', '//')): + continue + media_type = str_or_none(media_file.get('type')) + if not media_type: + continue + media_kind = media_type.split('/')[0].lower() + f = { + 'url': media_url, + 'width': int_or_none(media_file.get('width')), + 'height': int_or_none(media_file.get('height')), + } + format_id = str_or_none(media_file.get('quality')) + if media_kind == 'image': + f['id'] = format_id + thumbnails.append(f) + elif media_kind in ('video', 'audio'): + is_audio = media_kind == 'audio' + f.update({ + 'format_id': 'audio' if is_audio else format_id, + 'fps': int_or_none(media_file.get('fps')), + 'vcodec': 'none' if is_audio else None, + }) + formats.append(f) + self._sort_formats(formats) + + duration = int_or_none(data.get('duration')) + + return { + 'id': video_id, + 'title': video_id, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } From 7ff129d3ea6ec010493a3b98d960e943eda05595 Mon Sep 17 00:00:00 2001 From: Giuseppe Fabiano <gfabiano40@gmail.com> Date: Sun, 29 Jul 2018 22:15:06 +0200 Subject: [PATCH 111/111] [theplatform] Relax _VALID_URL (closes #16181) --- youtube_dl/extractor/theplatform.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 411b1f874..ffef5bf06 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -310,7 +310,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): class ThePlatformFeedIE(ThePlatformBaseIE): _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s' - _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[\w-]+))' + _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[^&]+))' _TESTS = [{ # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207', @@ -327,6 +327,9 @@ class ThePlatformFeedIE(ThePlatformBaseIE): 'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'], 'uploader': 'NBCU-NEWS', }, + }, { + 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byGuid=nn_netcast_180306.Copy.01', + 'only_matching': True, }] def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None):
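
A standalone sketch (illustrative only, not part of the patches above) of what relaxing `ThePlatformFeedIE._VALID_URL` in PATCH 111 changes: the old `by(?:Gui|I)d` group used `[\w-]+`, which stops at the first `.` and would truncate the GUID in the new `nn_netcast_180306.Copy.01` test URL, while `[^&]+` captures everything up to the next query parameter:

    import re

    # Filter groups as they appear before and after PATCH 111.
    OLD_FILTER = r'by(?:Gui|I)d=(?P<id>[\w-]+)'
    NEW_FILTER = r'by(?:Gui|I)d=(?P<id>[^&]+)'

    # Query adapted from the only_matching test URL added by the patch,
    # with an extra parameter appended to show where [^&]+ stops.
    query = 'byGuid=nn_netcast_180306.Copy.01&form=json'

    print(re.search(OLD_FILTER, query).group('id'))  # nn_netcast_180306 (cut at the '.')
    print(re.search(NEW_FILTER, query).group('id'))  # nn_netcast_180306.Copy.01

The trade-off is that `[^&]+` assumes a GUID never contains `&`; anything broader would swallow the following query parameters.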