From 05e7c184da85f83b254bc3d138f89b11da802bdb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 2 Oct 2018 06:07:06 +0100 Subject: [PATCH 001/159] [hotstar] fix extraction in python 2(closes #17696) --- youtube_dl/extractor/hotstar.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index 354ac00dc..bf5717f1b 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -11,6 +11,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + try_get, ) @@ -72,7 +73,11 @@ class HotStarIE(HotStarBaseIE): app_state = self._parse_json(self._search_regex( r'', webpage, 'app state'), video_id) - video_data = list(app_state.values())[0]['initialState']['contentData']['content'] + video_data = {} + for v in app_state.values(): + content = try_get(v, lambda x: x['initialState']['contentData']['content'], dict) + if content and content.get('contentId') == video_id: + video_data = content title = video_data['title'] From d98cb62e552ee74079fda2e4173a40b14faac3fe Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 2 Oct 2018 19:43:06 +0100 Subject: [PATCH 002/159] [crunchyroll] switch to HTTPS for RpcApi(closes #17749) --- youtube_dl/extractor/crunchyroll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index af786d096..045be0ab5 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -45,7 +45,7 @@ class CrunchyrollBaseIE(InfoExtractor): data['req'] = 'RpcApi' + method data = compat_urllib_parse_urlencode(data).encode('utf-8') return self._download_xml( - 'http://www.crunchyroll.com/xml/', + 'https://www.crunchyroll.com/xml/', video_id, note, fatal=False, data=data, headers={ 'Content-Type': 'application/x-www-form-urlencoded', }) From f60b9803a473da8e324313d01af91e5676792c77 Mon Sep 17 00:00:00 2001 From: Enes Date: 
Sat, 29 Sep 2018 13:28:56 +0300 Subject: [PATCH 003/159] [dailymotion] Fix extraction (closes #17699) --- youtube_dl/extractor/dailymotion.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 040f0bd02..842d9a259 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -24,6 +24,7 @@ from ..utils import ( str_to_int, unescapeHTML, urlencode_postdata, + try_get, ) @@ -172,7 +173,12 @@ class DailymotionIE(DailymotionBaseInfoExtractor): webpage, 'player v5', default=None) if player_v5: player = self._parse_json(player_v5, video_id) - metadata = player['metadata'] + metadata = try_get( + player, lambda x: x['metadata'], dict) or self._download_json( + 'http://www.dailymotion.com/player/metadata/video/%s' % video_id, video_id, query={ + 'integration': 'inline', + 'GK_PV5_NEON': '1', + }) if metadata.get('error', {}).get('type') == 'password_protected': password = self._downloader.params.get('videopassword') From 0082f44a08e33712fcd33ceabab15215c962eaac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 5 Oct 2018 02:02:58 +0700 Subject: [PATCH 004/159] [dailymotion] Improve metadata extraction (closes #17706) --- youtube_dl/extractor/dailymotion.py | 32 ++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 842d9a259..1816c559e 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -22,9 +22,11 @@ from ..utils import ( parse_iso8601, sanitized_Request, str_to_int, - unescapeHTML, - urlencode_postdata, try_get, + unescapeHTML, + update_url_query, + url_or_none, + urlencode_postdata, ) @@ -172,15 +174,25 @@ class DailymotionIE(DailymotionBaseInfoExtractor): r'__PLAYER_CONFIG__\s*=\s*({.+?});'], webpage, 'player v5', default=None) if player_v5: - player = 
self._parse_json(player_v5, video_id) - metadata = try_get( - player, lambda x: x['metadata'], dict) or self._download_json( - 'http://www.dailymotion.com/player/metadata/video/%s' % video_id, video_id, query={ - 'integration': 'inline', - 'GK_PV5_NEON': '1', - }) + player = self._parse_json(player_v5, video_id, fatal=False) or {} + metadata = try_get(player, lambda x: x['metadata'], dict) + if not metadata: + metadata_url = url_or_none(try_get( + player, lambda x: x['context']['metadata_template_url1'])) + if metadata_url: + metadata_url = metadata_url.replace(':videoId', video_id) + else: + metadata_url = update_url_query( + 'https://www.dailymotion.com/player/metadata/video/%s' + % video_id, { + 'embedder': url, + 'integration': 'inline', + 'GK_PV5_NEON': '1', + }) + metadata = self._download_json( + metadata_url, video_id, 'Downloading metadata JSON') - if metadata.get('error', {}).get('type') == 'password_protected': + if try_get(metadata, lambda x: x['error']['type']) == 'password_protected': password = self._downloader.params.get('videopassword') if password: r = int(metadata['id'][1:], 36) From 21c1a00dd7dbb9f7551ca9809a194f6380dee7a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 5 Oct 2018 02:27:14 +0700 Subject: [PATCH 005/159] [pluralsight] Improve authentication (closes #17762) --- youtube_dl/extractor/pluralsight.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index daf172570..eafe56897 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -4,6 +4,7 @@ import collections import json import os import random +import re from .common import InfoExtractor from ..compat import ( @@ -196,7 +197,10 @@ query viewClip { if error: raise ExtractorError('Unable to login: %s' % error, expected=True) - if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')): + if all(not re.search(p, 
response) for p in ( + r'__INITIAL_STATE__', r'["\']currentUser["\']', + # new layout? + r'>\s*Sign out\s*<')): BLOCKED = 'Your account has been blocked due to suspicious activity' if BLOCKED in response: raise ExtractorError( From 2e7ed29e3429c20e735f3f7dfceb1b13cf757037 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 5 Oct 2018 02:29:52 +0700 Subject: [PATCH 006/159] [ChangeLog] Actualize [ci skip] --- ChangeLog | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ChangeLog b/ChangeLog index 241712037..e2757f891 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +version + +Extractors +* [pluralsight] Improve authentication (#17762) +* [dailymotion] Fix extraction (#17699) +* [crunchyroll] Switch to HTTPS for RpcApi (#17749) ++ [philharmoniedeparis] Add support for pad.philharmoniedeparis.fr (#17705) +* [philharmoniedeparis] Fix extraction (#17705) ++ [jamendo] Add support for licensing.jamendo.com (#17724) ++ [openload] Add support for oload.cloud (#17710) +* [pluralsight] Fix subtitles extraction (#17726, #17728) ++ [vimeo] Add another config regular expression (#17690) +* [spike] Fix Paramount Network extraction (#17677) +* [hotstar] Fix extraction (#14694, #14931, #17637) + + version 2018.09.26 Extractors From d96f976b0c36f65894380d3d831b0520d6260c20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 5 Oct 2018 02:31:30 +0700 Subject: [PATCH 007/159] release 2018.10.05 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index ed3e0a157..058eb4321 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.09.26*. 
If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.09.26** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.10.05*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.10.05** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.09.26 +[debug] youtube-dl version 2018.10.05 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index e2757f891..86cf489b1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.10.05 Extractors * [pluralsight] Improve authentication (#17762) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 736ab6da7..f167a6ddc 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -360,7 +360,7 @@ - **HitRecord** - **HornBunny** - **HotNewHipHop** - - **HotStar** + - **hotstar** - **hotstar:playlist** - **Howcast** - **HowStuffWorks** diff --git a/youtube_dl/version.py 
b/youtube_dl/version.py index 6f2cc31df..7d3f25019 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.09.26' +__version__ = '2018.10.05' From c9d891f19a923f53132b49a1f5b97f344d92503c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 5 Oct 2018 20:11:01 +0100 Subject: [PATCH 008/159] [patreon] fix extraction(closes #14502)(closes #10471) --- youtube_dl/extractor/patreon.py | 160 ++++++++++++++++++-------------- 1 file changed, 88 insertions(+), 72 deletions(-) diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index 9eb027679..6f73ed68d 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -2,52 +2,63 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + parse_iso8601, +) class PatreonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(?P[^&#]+)' - _TESTS = [ - { - 'url': 'http://www.patreon.com/creation?hid=743933', - 'md5': 'e25505eec1053a6e6813b8ed369875cc', - 'info_dict': { - 'id': '743933', - 'ext': 'mp3', - 'title': 'Episode 166: David Smalley of Dogma Debate', - 'uploader': 'Cognitive Dissonance Podcast', - 'thumbnail': 're:^https?://.*$', - }, + _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P\d+)' + _TESTS = [{ + 'url': 'http://www.patreon.com/creation?hid=743933', + 'md5': 'e25505eec1053a6e6813b8ed369875cc', + 'info_dict': { + 'id': '743933', + 'ext': 'mp3', + 'title': 'Episode 166: David Smalley of Dogma Debate', + 'description': 'md5:713b08b772cd6271b9f3906683cfacdf', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': 're:^https?://.*$', + 'timestamp': 1406473987, + 'upload_date': '20140727', }, - { - 'url': 'http://www.patreon.com/creation?hid=754133', - 'md5': '3eb09345bf44bf60451b8b0b81759d0a', - 
'info_dict': { - 'id': '754133', - 'ext': 'mp3', - 'title': 'CD 167 Extra', - 'uploader': 'Cognitive Dissonance Podcast', - 'thumbnail': 're:^https?://.*$', - }, + }, { + 'url': 'http://www.patreon.com/creation?hid=754133', + 'md5': '3eb09345bf44bf60451b8b0b81759d0a', + 'info_dict': { + 'id': '754133', + 'ext': 'mp3', + 'title': 'CD 167 Extra', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': 're:^https?://.*$', }, - { - 'url': 'https://www.patreon.com/creation?hid=1682498', - 'info_dict': { - 'id': 'SU4fj_aEMVw', - 'ext': 'mp4', - 'title': 'I\'m on Patreon!', - 'uploader': 'TraciJHines', - 'thumbnail': 're:^https?://.*$', - 'upload_date': '20150211', - 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4', - 'uploader_id': 'TraciJHines', - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - } + 'skip': 'Patron-only content', + }, { + 'url': 'https://www.patreon.com/creation?hid=1682498', + 'info_dict': { + 'id': 'SU4fj_aEMVw', + 'ext': 'mp4', + 'title': 'I\'m on Patreon!', + 'uploader': 'TraciJHines', + 'thumbnail': 're:^https?://.*$', + 'upload_date': '20150211', + 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4', + 'uploader_id': 'TraciJHines', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, } - ] + }, { + 'url': 'https://www.patreon.com/posts/episode-166-of-743933', + 'only_matching': True, + }, { + 'url': 'https://www.patreon.com/posts/743933', + 'only_matching': True, + }] # Currently Patreon exposes download URL via hidden CSS, so login is not # needed. Keeping this commented for when this inevitably changes. @@ -78,38 +89,43 @@ class PatreonIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage).strip() - - attach_fn = self._html_search_regex( - r'
', - webpage, 'attachment URL', default=None) - embed = self._html_search_regex( - r']+id="watchCreation"[^>]*>\s*]+src="([^"]+)"', - webpage, 'embedded URL', default=None) - - if attach_fn is not None: - video_url = 'http://www.patreon.com' + attach_fn - thumbnail = self._og_search_thumbnail(webpage) - uploader = self._html_search_regex( - r'(.*?) is creating', webpage, 'uploader') - elif embed is not None: - return self.url_result(embed) - else: - playlist = self._parse_json(self._search_regex( - r'(?s)new\s+jPlayerPlaylist\(\s*\{\s*[^}]*},\s*(\[.*?,?\s*\])', - webpage, 'playlist JSON'), - video_id, transform_source=js_to_json) - data = playlist[0] - video_url = self._proto_relative_url(data['mp3']) - thumbnail = self._proto_relative_url(data.get('cover')) - uploader = data.get('artist') - - return { + post = self._download_json( + 'https://www.patreon.com/api/posts/' + video_id, video_id) + attributes = post['data']['attributes'] + title = attributes['title'].strip() + image = attributes.get('image') or {} + info = { 'id': video_id, - 'url': video_url, - 'ext': 'mp3', 'title': title, - 'uploader': uploader, - 'thumbnail': thumbnail, + 'description': clean_html(attributes.get('content')), + 'thumbnail': image.get('large_url') or image.get('url'), + 'timestamp': parse_iso8601(attributes.get('published_at')), + 'like_count': int_or_none(attributes.get('like_count')), + 'comment_count': int_or_none(attributes.get('comment_count')), } + + for i in post.get('included', []): + i_type = i.get('type') + if i_type == 'attachment': + attachment_attributes = i.get('attributes') or {} + attachment_url = attachment_attributes.get('url') + if attachment_url: + info.update({ + 'url': attachment_url, + 'ext': determine_ext(attachment_attributes.get('name'), 'mp3'), + }) + elif i_type == 'user': + user_attributes = i.get('attributes') + if user_attributes: + info.update({ + 'uploader': user_attributes.get('full_name'), + 'uploader_url': user_attributes.get('url'), + }) + + if not 
info.get('url'): + info.update({ + '_type': 'url', + 'url': attributes['embed']['url'], + }) + + return info From 19a352854f5143b7cd120e990433d0fd40f617b0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 5 Oct 2018 22:45:04 +0100 Subject: [PATCH 009/159] [patreon] extract post_file url(#17792) --- youtube_dl/extractor/patreon.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index 6f73ed68d..426dd8121 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -104,16 +104,18 @@ class PatreonIE(InfoExtractor): 'comment_count': int_or_none(attributes.get('comment_count')), } + def add_file(file_data): + file_url = file_data.get('url') + if file_url: + info.update({ + 'url': file_url, + 'ext': determine_ext(file_data.get('name'), 'mp3'), + }) + for i in post.get('included', []): i_type = i.get('type') if i_type == 'attachment': - attachment_attributes = i.get('attributes') or {} - attachment_url = attachment_attributes.get('url') - if attachment_url: - info.update({ - 'url': attachment_url, - 'ext': determine_ext(attachment_attributes.get('name'), 'mp3'), - }) + add_file(i.get('attributes') or {}) elif i_type == 'user': user_attributes = i.get('attributes') if user_attributes: @@ -122,6 +124,9 @@ class PatreonIE(InfoExtractor): 'uploader_url': user_attributes.get('url'), }) + if not info.get('url'): + add_file(attributes.get('post_file') or {}) + if not info.get('url'): info.update({ '_type': 'url', From 5d90a8a5f3fef73bb4ccbecd8c61583522b88d79 Mon Sep 17 00:00:00 2001 From: yonaikerlol <39972049+yonaikerlol@users.noreply.github.com> Date: Sun, 7 Oct 2018 09:05:45 -0400 Subject: [PATCH 010/159] [openload] Add support for oload.cc --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index dc01b6346..c652603a5 
100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -314,6 +314,9 @@ class OpenloadIE(InfoExtractor): # Its title has not got its extension but url has it 'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', 'only_matching': True, + }, { + 'url': 'https://oload.cc/embed/5NEAbI2BDSk', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From a94e7c195e261137461b546c6446033b371dfbbe Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Oct 2018 11:51:40 +0100 Subject: [PATCH 011/159] [ted] fix extraction for http and rtmp formats(closes #5941)(closes #17572)(closes #17894) --- youtube_dl/extractor/ted.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 212ac80ab..f9b6aa48f 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -212,8 +212,6 @@ class TEDIE(InfoExtractor): http_url = None for format_id, resources in resources_.items(): - if not isinstance(resources, dict): - continue if format_id == 'h264': for resource in resources: h264_url = resource.get('file') @@ -242,6 +240,8 @@ class TEDIE(InfoExtractor): 'tbr': int_or_none(resource.get('bitrate')), }) elif format_id == 'hls': + if not isinstance(resources, dict): + continue stream_url = url_or_none(resources.get('stream')) if not stream_url: continue From f0ee386851bb0d53801a27dafbe4e8fee5b43d88 
Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Oct 2018 16:26:29 +0100 Subject: [PATCH 012/159] [tv3] remove extractor(closes #10461)(closes #15339) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/tv3.py | 34 ------------------------------ 2 files changed, 35 deletions(-) delete mode 100644 youtube_dl/extractor/tv3.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 464c8d690..17b576df3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1153,7 +1153,6 @@ from .tv2 import ( TV2ArticleIE, ) from .tv2hu import TV2HuIE -from .tv3 import TV3IE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE from .tva import TVAIE diff --git a/youtube_dl/extractor/tv3.py b/youtube_dl/extractor/tv3.py deleted file mode 100644 index 3867ec90d..000000000 --- a/youtube_dl/extractor/tv3.py +++ /dev/null @@ -1,34 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class TV3IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv3\.co\.nz/(?P[^/]+)/tabid/\d+/articleID/\d+/MCat/\d+/Default\.aspx' - _TEST = { - 'url': 'http://www.tv3.co.nz/MOTORSPORT-SRS-SsangYong-Hampton-Downs-Round-3/tabid/3692/articleID/121615/MCat/2915/Default.aspx', - 'info_dict': { - 'id': '4659127992001', - 'ext': 'mp4', - 'title': 'CRC Motorsport: SRS SsangYong Hampton Downs Round 3 - S2015 Ep3', - 'description': 'SsangYong Racing Series returns for Round 3 with drivers from New Zealand and Australia taking to the grid at Hampton Downs raceway.', - 'uploader_id': '3812193411001', - 'upload_date': '20151213', - 'timestamp': 1449975272, - }, - 'expected_warnings': [ - 'Failed to download MPD manifest' - ], - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/3812193411001/default_default/index.html?videoId=%s' - - def _real_extract(self, url): - display_id = 
self._match_id(url) - webpage = self._download_webpage(url, display_id) - brightcove_id = self._search_regex(r' Date: Mon, 15 Oct 2018 17:54:38 +0100 Subject: [PATCH 013/159] [brightcove:legacy] fall back to brightcove:new(#13912) --- youtube_dl/extractor/brightcove.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 14f9a14ed..5dbd71e12 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -356,7 +356,9 @@ class BrightcoveLegacyIE(InfoExtractor): def _extract_video_info(self, video_info): video_id = compat_str(video_info['id']) + publisher_id = video_info.get('publisherId') + info = { 'id': video_id, 'title': video_info['displayName'].strip(), @@ -444,8 +446,16 @@ class BrightcoveLegacyIE(InfoExtractor): else: return ad_info - if 'url' not in info and not info.get('formats'): - raise ExtractorError('Unable to extract video url for %s' % video_id) + if not info.get('url') and not info.get('formats'): + uploader_id = info.get('uploader_id') + if uploader_id: + info.update({ + '_type': 'url', + 'url': 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (uploader_id, video_id), + 'ie_key': BrightcoveNewIE.ie_key(), + }) + else: + raise ExtractorError('Unable to extract video url for %s' % video_id) return info From 160c2773f63c72686635533bc2553634b22e7e2e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Oct 2018 18:41:57 +0100 Subject: [PATCH 014/159] [brightcove:legacy] add another fall back to brightcove:new --- youtube_dl/extractor/brightcove.py | 39 ++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 5dbd71e12..40c3959fd 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -1,8 +1,10 @@ # coding: utf-8 from __future__ import 
unicode_literals -import re +import base64 import json +import re +import struct from .common import InfoExtractor from .adobepass import AdobePassIE @@ -310,6 +312,10 @@ class BrightcoveLegacyIE(InfoExtractor): 'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?', expected=True) + def _brightcove_new_url_result(self, publisher_id, video_id): + brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id) + return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id) + def _get_video_info(self, video_id, query, referer=None): headers = {} linkBase = query.get('linkBaseURL') @@ -323,6 +329,29 @@ class BrightcoveLegacyIE(InfoExtractor): r"

We're sorry.

([\s\n]*

.*?

)+", webpage, 'error message', default=None) if error_msg is not None: + publisher_id = query.get('publisherId') + if publisher_id and publisher_id[0].isdigit(): + publisher_id = publisher_id[0] + if not publisher_id: + valid_key = lambda key: key and ',' in key + player_key = query.get('playerKey') + if player_key and ',' in player_key[0]: + player_key = player_key[0] + else: + player_id = query.get('playerID') + if player_id and player_id[0].isdigit(): + player_page = self._download_webpage( + 'http://link.brightcove.com/services/player/bcpid' + player_id[0], + video_id, headers=headers, fatal=False) + if player_page: + player_key = self._search_regex( + r'Q', base64.urlsafe_b64decode(enc_pub_id))[0] + if publisher_id: + return self._brightcove_new_url_result(publisher_id, video_id) raise ExtractorError( 'brightcove said: %s' % error_msg, expected=True) @@ -356,9 +385,7 @@ class BrightcoveLegacyIE(InfoExtractor): def _extract_video_info(self, video_info): video_id = compat_str(video_info['id']) - publisher_id = video_info.get('publisherId') - info = { 'id': video_id, 'title': video_info['displayName'].strip(), @@ -449,11 +476,7 @@ class BrightcoveLegacyIE(InfoExtractor): if not info.get('url') and not info.get('formats'): uploader_id = info.get('uploader_id') if uploader_id: - info.update({ - '_type': 'url', - 'url': 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (uploader_id, video_id), - 'ie_key': BrightcoveNewIE.ie_key(), - }) + info.update(self._brightcove_new_url_result(uploader_id, video_id)) else: raise ExtractorError('Unable to extract video url for %s' % video_id) return info From 582797d780b6f4857a5b8b6ca8c63915242c0ab9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Oct 2018 20:47:12 +0100 Subject: [PATCH 015/159] [brightcove] remove unused variable --- youtube_dl/extractor/brightcove.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 
40c3959fd..465ae396e 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -333,7 +333,6 @@ class BrightcoveLegacyIE(InfoExtractor): if publisher_id and publisher_id[0].isdigit(): publisher_id = publisher_id[0] if not publisher_id: - valid_key = lambda key: key and ',' in key player_key = query.get('playerKey') if player_key and ',' in player_key[0]: player_key = player_key[0] From baeabf77428ad1a6bd5a910e7be07100fcb1eadd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 16 Oct 2018 23:19:44 +0700 Subject: [PATCH 016/159] [rutube] Use geo verification headers (closes #17897) --- youtube_dl/extractor/rutube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 261bcbb83..10ac8ed1f 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -103,7 +103,8 @@ class RutubeIE(RutubeBaseIE): options = self._download_json( 'http://rutube.ru/api/play/options/%s/?format=json' % video_id, - video_id, 'Downloading options JSON') + video_id, 'Downloading options JSON', + headers=self.geo_verification_headers()) formats = [] for format_id, format_url in options['video_balancer'].items(): From b99b0bcfa079a15a988cf931a3ce44bb480dfbdb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 17 Oct 2018 06:22:07 +0100 Subject: [PATCH 017/159] [cwtv] handle api errors(closes #17905) --- youtube_dl/extractor/cwtv.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index 224a1fb5d..f9bd535f6 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, parse_age_limit, parse_iso8601, @@ -66,9 +67,12 @@ class CWTVIE(InfoExtractor): def _real_extract(self, url): video_id = 
self._match_id(url) - video_data = self._download_json( + data = self._download_json( 'http://images.cwtv.com/feed/mobileapp/video-meta/apiversion_8/guid_' + video_id, - video_id)['video'] + video_id) + if data.get('result') != 'ok': + raise ExtractorError(data['msg'], expected=True) + video_data = data['video'] title = video_data['title'] mpx_url = video_data.get('mpx_url') or 'http://link.theplatform.com/s/cwtv/media/guid/2703454149/%s?formats=M3U' % video_id From 7d9e858132cd28b975d2174f5836d3de03f741d1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Oct 2018 05:40:49 +0100 Subject: [PATCH 018/159] [viewster] reduce format requests --- youtube_dl/extractor/viewster.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index d5d5b4c69..6e318479c 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -130,16 +130,16 @@ class ViewsterIE(InfoExtractor): def concat(suffix, sep='-'): return (base_format_id + '%s%s' % (sep, suffix)) if base_format_id else suffix - for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): - media = self._download_json( - 'https://public-api.viewster.com/movies/%s/video' % entry_id, - video_id, 'Downloading %s JSON' % concat(media_type, ' '), fatal=False, query={ - 'mediaType': media_type, - 'language': audio, - 'subtitle': subtitle, - }) - if not media: - continue + medias = self._download_json( + 'https://public-api.viewster.com/movies/%s/videos' % entry_id, + video_id, fatal=False, query={ + 'mediaTypes': ['application/f4m+xml', 'application/x-mpegURL', 'video/mp4'], + 'language': audio, + 'subtitle': subtitle, + }) + if not medias: + continue + for media in medias: video_url = media.get('Uri') if not video_url: continue From 5e733b066a5fca7fe91ad5800a3e83f0a49c8fbd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Oct 2018 05:41:57 +0100 Subject: [PATCH 
019/159] [dailymail] fix format extraction(closes #17976) --- youtube_dl/extractor/dailymail.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index af3978035..4f75a2a30 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -49,6 +49,9 @@ class DailyMailIE(InfoExtractor): 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id) video_sources = self._download_json(sources_url, video_id) + body = video_sources.get('body') + if body: + video_sources = body formats = [] for rendition in video_sources['renditions']: From 08c7d3dadec053ff5535ab2dc91f550ef4788297 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Oct 2018 22:12:54 +0700 Subject: [PATCH 020/159] [crunchyroll] Improve extraction failsafeness (closes #17991) --- youtube_dl/extractor/crunchyroll.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 045be0ab5..4a68d092b 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re import json +import xml.etree.ElementTree as etree import zlib from hashlib import sha1 @@ -398,7 +399,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'Downloading subtitles for ' + sub_name, data={ 'subtitle_script_id': sub_id, }) - if sub_doc is None: + if not isinstance(sub_doc, etree.Element): continue sid = sub_doc.get('id') iv = xpath_text(sub_doc, 'iv', 'subtitle iv') @@ -515,7 +516,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'video_quality': stream_quality, 'current_page': url, }) - if streamdata is not None: + if isinstance(streamdata, etree.Element): stream_info = streamdata.find('./{default}preload/stream_info') if stream_info 
is not None: stream_infos.append(stream_info) @@ -526,7 +527,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'video_format': stream_format, 'video_encode_quality': stream_quality, }) - if stream_info is not None: + if isinstance(stream_info, etree.Element): stream_infos.append(stream_info) for stream_info in stream_infos: video_encode_id = xpath_text(stream_info, './video_encode_id') @@ -598,10 +599,22 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text series = self._html_search_regex( r'(?s)]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)]+id=["\']showmedia_about_episode_num[^>]+>.+?\s*

\s*Season (\d+)', @@ -611,8 +624,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'id': video_id, 'title': video_title, 'description': video_description, - 'duration': float_or_none(media_metadata.get('duration'), 1000), - 'thumbnail': xpath_text(metadata, 'episode_image_url') or media_metadata.get('thumbnail', {}).get('url'), + 'duration': duration, + 'thumbnail': thumbnail, 'uploader': video_uploader, 'upload_date': video_upload_date, 'series': series, From 022218f2f0dce112c0e2b15923c3a368bdfe4d6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Oct 2018 22:49:10 +0700 Subject: [PATCH 021/159] [ivi] Add support for ivi.tv --- youtube_dl/extractor/ivi.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index cb51cef2d..86c014b07 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -15,7 +15,7 @@ from ..utils import ( class IviIE(InfoExtractor): IE_DESC = 'ivi.ru' IE_NAME = 'ivi' - _VALID_URL = r'https?://(?:www\.)?ivi\.ru/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P\d+)' _GEO_BYPASS = False _GEO_COUNTRIES = ['RU'] @@ -65,7 +65,11 @@ class IviIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, 'skip': 'Only works from Russia', - } + }, + { + 'url': 'https://www.ivi.tv/watch/33560/', + 'only_matching': True, + }, ] # Sorted by quality From c901cc38e50a47c9659db534e637ac4f6a54c450 Mon Sep 17 00:00:00 2001 From: yonaikerlol Date: Sun, 28 Oct 2018 11:51:29 -0400 Subject: [PATCH 022/159] [openload] Add support for oload.icu --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index c652603a5..a91f29f5c 100644 --- a/youtube_dl/extractor/openload.py +++ 
b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -317,6 +317,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.cc/embed/5NEAbI2BDSk', 'only_matching': True, + }, { + 'url': 'https://oload.icu/f/-_i4y_F_Hs8', + 'only_matching': True }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From 1fafb329849e3f07e6a6e4141bcd4547e141745c Mon Sep 17 00:00:00 2001 From: sichuan-pepper Date: Sun, 28 Oct 2018 01:46:32 +0900 Subject: [PATCH 023/159] [screencast] Fix extraction (closes #14590) --- youtube_dl/extractor/screencast.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index 62a6a8337..c6554c905 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -90,6 +90,14 @@ class ScreencastIE(InfoExtractor): r'src=(.*?)(?:$|&)', video_meta, 'meta tag video URL', default=None) + if video_url is None: + video_url = self._html_search_regex( + r'"MediaContentUrl":"([^"]+)"', webpage, 'media content url', default=None) + + if video_url is None: + video_url = self._html_search_meta( + 'og:video', webpage, default=None) + if video_url is None: raise ExtractorError('Cannot find video') From a1d1c63678dcb075a8e741947c41abfee6c790a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Oct 2018 23:23:32 +0700 Subject: [PATCH 024/159] [screencast] Improve extraction (closes #14617, closes #17990) --- 
youtube_dl/extractor/screencast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index c6554c905..69a0d01f3 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -92,7 +92,8 @@ class ScreencastIE(InfoExtractor): if video_url is None: video_url = self._html_search_regex( - r'"MediaContentUrl":"([^"]+)"', webpage, 'media content url', default=None) + r'MediaContentUrl["\']\s*:(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'video url', default=None, group='url') if video_url is None: video_url = self._html_search_meta( From 4c237ab78768972e4d61d0b97fe9078d95dc4433 Mon Sep 17 00:00:00 2001 From: Alexey Trofimov Date: Fri, 26 Oct 2018 15:00:55 +0700 Subject: [PATCH 025/159] [sportbox] Fix extraction --- youtube_dl/extractor/sportbox.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 54497c880..9413cf27a 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -18,7 +18,7 @@ class SportBoxEmbedIE(InfoExtractor): 'info_dict': { 'id': '211355', 'ext': 'mp4', - 'title': '211355', + 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 292, 'view_count': int, @@ -48,9 +48,18 @@ class SportBoxEmbedIE(InfoExtractor): wjplayer_data = self._parse_json( self._search_regex( - r'(?s)wjplayer\(({.+?})\);', webpage, 'wjplayer settings'), + r'(?s)var\s+playerOptions\s*=\s*({.+?});', webpage, 'wjplayer settings'), video_id, transform_source=js_to_json) + wjplayer_data['sources'] = self._parse_json( + self._search_regex( + r'(?s)playerOptions\.sources\s*=\s*(\[.+?\]);', webpage, 'wjplayer sources'), + video_id, transform_source=js_to_json) + + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage) or self._html_search_regex( + r'(.+?)', 
webpage, 'title', fatal=False) or video_id + formats = [] for source in wjplayer_data['sources']: src = source.get('src') @@ -71,7 +80,7 @@ class SportBoxEmbedIE(InfoExtractor): return { 'id': video_id, - 'title': video_id, + 'title': title, 'thumbnail': wjplayer_data.get('poster'), 'duration': int_or_none(wjplayer_data.get('duration')), 'view_count': view_count, From bebef109092ba2ad1b08619661aa1b51e65be1bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Oct 2018 00:19:08 +0700 Subject: [PATCH 026/159] [extractor/common] Add validation for JSON-LD URLs --- youtube_dl/extractor/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2dbf81e6e..8452125c8 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -69,6 +69,7 @@ from ..utils import ( update_url_query, urljoin, url_basename, + url_or_none, xpath_element, xpath_text, xpath_with_ns, @@ -1213,10 +1214,10 @@ class InfoExtractor(object): def extract_video_object(e): assert e['@type'] == 'VideoObject' info.update({ - 'url': e.get('contentUrl'), + 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), + 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), 'filesize': float_or_none(e.get('contentSize')), From 476cf548e1c6aa83686150db7abf625c6237a67f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Oct 2018 00:20:29 +0700 Subject: [PATCH 027/159] [sportbox] Improve extraction, add support for matchtv.ru and fix video id (closes #17978) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/generic.py | 6 ++-- youtube_dl/extractor/sportbox.py | 55 
++++++++++++++++++------------ 3 files changed, 37 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 17b576df3..f013d13c3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1043,7 +1043,7 @@ from .spike import ( ) from .stitcher import StitcherIE from .sport5 import Sport5IE -from .sportbox import SportBoxEmbedIE +from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 2a48667f0..545e03371 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -47,7 +47,7 @@ from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE from .tvc import TVCIE -from .sportbox import SportBoxEmbedIE +from .sportbox import SportBoxIE from .smotri import SmotriIE from .myvi import MyviIE from .condenast import CondeNastIE @@ -2636,9 +2636,9 @@ class GenericIE(InfoExtractor): return self.url_result(tvc_url, 'TVC') # Look for embedded SportBox player - sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) + sportbox_urls = SportBoxIE._extract_urls(webpage) if sportbox_urls: - return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed') + return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key()) # Look for embedded XHamster player xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 9413cf27a..b9017fd2a 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -8,20 +8,24 @@ from ..utils import ( determine_ext, int_or_none, js_to_json, + merge_dicts, ) -class SportBoxEmbedIE(InfoExtractor): - _VALID_URL = 
r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P\d+)' +class SportBoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P\d+)' _TESTS = [{ 'url': 'http://news.sportbox.ru/vdl/player/ci/211355', 'info_dict': { - 'id': '211355', + 'id': '109158', 'ext': 'mp4', 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', + 'description': 'В Новороссийске прошел детский турнир «Поле славы боевой»', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 292, 'view_count': int, + 'timestamp': 1426237001, + 'upload_date': '20150313', }, 'params': { # m3u8 download @@ -33,12 +37,18 @@ class SportBoxEmbedIE(InfoExtractor): }, { 'url': 'https://news.sportbox.ru/vdl/player/media/193095', 'only_matching': True, + }, { + 'url': 'https://news.sportbox.ru/vdl/player/media/109158', + 'only_matching': True, + }, { + 'url': 'https://matchtv.ru/vdl/player/media/109158', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r']+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"', + r']+src="(https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"', webpage) def _real_extract(self, url): @@ -46,22 +56,14 @@ class SportBoxEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - wjplayer_data = self._parse_json( + sources = self._parse_json( self._search_regex( - r'(?s)var\s+playerOptions\s*=\s*({.+?});', webpage, 'wjplayer settings'), + r'(?s)playerOptions\.sources(?:WithRes)?\s*=\s*(\[.+?\])\s*;\s*\n', + webpage, 'sources'), video_id, transform_source=js_to_json) - wjplayer_data['sources'] = self._parse_json( - self._search_regex( - r'(?s)playerOptions\.sources\s*=\s*(\[.+?\]);', webpage, 'wjplayer sources'), - video_id, transform_source=js_to_json) - - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage) or self._html_search_regex( - r'(.+?)', webpage, 'title', fatal=False) or video_id - formats = [] - for source in 
wjplayer_data['sources']: + for source in sources: src = source.get('src') if not src: continue @@ -75,14 +77,23 @@ class SportBoxEmbedIE(InfoExtractor): }) self._sort_formats(formats) + player = self._parse_json( + self._search_regex( + r'(?s)playerOptions\s*=\s*({.+?})\s*;\s*\n', webpage, + 'player options', default='{}'), + video_id, transform_source=js_to_json) + media_id = player['mediaId'] + + info = self._search_json_ld(webpage, media_id, default={}) + view_count = int_or_none(self._search_regex( r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None)) - return { - 'id': video_id, - 'title': title, - 'thumbnail': wjplayer_data.get('poster'), - 'duration': int_or_none(wjplayer_data.get('duration')), + return merge_dicts(info, { + 'id': media_id, + 'title': self._og_search_title(webpage, default=None) or media_id, + 'thumbnail': player.get('poster'), + 'duration': int_or_none(player.get('duration')), 'view_count': view_count, 'formats': formats, - } + }) From c2fe21efaaf5be47da9d88bb2a490c688bc920f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Oct 2018 00:38:06 +0700 Subject: [PATCH 028/159] [ChangeLog] Actualize [ci skip] --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index 86cf489b1..a21177dac 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version + +Core ++ [extractor/common] Add validation for JSON-LD URLs + +Extractors ++ [sportbox] Add support for matchtv.ru +* [sportbox] Fix extraction (#17978) +* [screencast] Fix extraction (#14590, #14617, #17990) ++ [openload] Add support for oload.icu ++ [ivi] Add support for ivi.tv +* [crunchyroll] Improve extraction failsafeness (#17991) +* [dailymail] Fix formats extraction (#17976) +* [viewster] Reduce format requests +* [cwtv] Handle API errors (#17905) ++ [rutube] Use geo verification headers (#17897) ++ [brightcove:legacy] Add fallbacks to brightcove:new (#13912) +- [tv3] Remove extractor 
(#10461, #15339) +* [ted] Fix extraction for HTTP and RTMP formats (#5941, #17572, #17894) ++ [openload] Add support for oload.cc (#17823) ++ [patreon] Extract post_file URL (#17792) +* [patreon] Fix extraction (#14502, #10471) + + version 2018.10.05 Extractors From 9ff558f67f2285a17d2a4214b5f74aeb6ce4d9b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Oct 2018 00:39:29 +0700 Subject: [PATCH 029/159] release 2018.10.29 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 +-- youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 058eb4321..aefed163a 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.10.05*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.10.05** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.10.29*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.10.29** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.10.05 +[debug] youtube-dl version 2018.10.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index a21177dac..57dbde12d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.10.29 Core + [extractor/common] Add validation for JSON-LD URLs diff --git a/docs/supportedsites.md b/docs/supportedsites.md index f167a6ddc..e5a6879bc 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -818,7 +818,7 @@ - **Spiegeltv** - **sport.francetvinfo.fr** - **Sport5** - - **SportBoxEmbed** + - **SportBox** - **SportDeutschland** - **SpringboardPlatform** - **Sprout** @@ -909,7 +909,6 @@ - **TV2** - **tv2.hu** - **TV2Article** - - **TV3** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ - **TVA** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7d3f25019..ae9a77966 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.10.05' +__version__ = '2018.10.29' From 9c4a83a1bec9d7abd066a89be40e62dd36ffa67d Mon Sep 17 00:00:00 2001 From: Ali Irani Date: Thu, 6 Sep 2018 02:08:38 +0430 Subject: [PATCH 030/159] [aparat] Fix 
extraction --- youtube_dl/extractor/aparat.py | 36 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 6eb8bbb6e..780439e17 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -34,32 +34,32 @@ class AparatIE(InfoExtractor): 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, video_id) - title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title') - file_list = self._parse_json( self._search_regex( - r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, + r'var options\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id) + title = file_list['plugins']['sabaPlayerPlugin']['title'] + formats = [] - for item in file_list[0]: - file_url = url_or_none(item.get('file')) - if not file_url: - continue - ext = mimetype2ext(item.get('type')) - label = item.get('label') - formats.append({ - 'url': file_url, - 'ext': ext, - 'format_id': label or ext, - 'height': int_or_none(self._search_regex( - r'(\d+)[pP]', label or '', 'height', default=None)), - }) + for list in file_list['plugins']['sabaPlayerPlugin']['multiSRC']: + for item in list: + file_url = url_or_none(item.get('src')) + if not file_url: + continue + ext = mimetype2ext(item.get('type')) + label = item.get('label') + formats.append({ + 'url': file_url, + 'ext': ext, + 'format_id': label or ext, + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', label or '', 'height', default=None)), + }) self._sort_formats(formats) - thumbnail = self._search_regex( - r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False) + thumbnail = file_list['poster'] return { 'id': video_id, From 2943397e8701d3dcd28433e485e50459fdbda62a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Oct 2018 23:29:05 +0700 Subject: [PATCH 031/159] [aparat] Improve extraction and extract more metadata (closes 
#17445, closes #18008) --- youtube_dl/extractor/aparat.py | 89 ++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 780439e17..883dcee7a 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, + merge_dicts, mimetype2ext, url_or_none, ) @@ -12,59 +13,83 @@ from ..utils import ( class AparatIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.aparat.com/v/wP8On', 'md5': '131aca2e14fe7c4dcb3c4877ba300c89', 'info_dict': { 'id': 'wP8On', 'ext': 'mp4', 'title': 'تیم گلکسی 11 - زومیت', - 'age_limit': 0, + 'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028', + 'duration': 231, + 'timestamp': 1387394859, + 'upload_date': '20131218', + 'view_count': int, }, - # 'skip': 'Extremely unreliable', - } + }, { + # multiple formats + 'url': 'https://www.aparat.com/v/8dflw/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - # Note: There is an easier-to-parse configuration at - # http://www.aparat.com/video/video/config/videohash/%video_id - # but the URL in there does not work - webpage = self._download_webpage( - 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, - video_id) + # Provides more metadata + webpage = self._download_webpage(url, video_id, fatal=False) - file_list = self._parse_json( + if not webpage: + # Note: There is an easier-to-parse configuration at + # http://www.aparat.com/video/video/config/videohash/%video_id + # but the URL in there does not work + webpage = self._download_webpage( + 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, + video_id) + + options = 
self._parse_json( self._search_regex( - r'var options\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, - 'file list'), + r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P(?:(?!\1).)+)\1\s*\)', + webpage, 'options', group='value'), video_id) - title = file_list['plugins']['sabaPlayerPlugin']['title'] + player = options['plugins']['sabaPlayerPlugin'] formats = [] - for list in file_list['plugins']['sabaPlayerPlugin']['multiSRC']: - for item in list: + for sources in player['multiSRC']: + for item in sources: + if not isinstance(item, dict): + continue file_url = url_or_none(item.get('src')) if not file_url: continue - ext = mimetype2ext(item.get('type')) - label = item.get('label') - formats.append({ - 'url': file_url, - 'ext': ext, - 'format_id': label or ext, - 'height': int_or_none(self._search_regex( - r'(\d+)[pP]', label or '', 'height', default=None)), - }) - self._sort_formats(formats) + item_type = item.get('type') + if item_type == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + ext = mimetype2ext(item.get('type')) + label = item.get('label') + formats.append({ + 'url': file_url, + 'ext': ext, + 'format_id': 'http-%s' % (label or ext), + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', label or '', 'height', + default=None)), + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) - thumbnail = file_list['poster'] + info = self._search_json_ld(webpage, video_id, default={}) - return { + if not info.get('title'): + info['title'] = player['title'] + + return merge_dicts(info, { 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'age_limit': self._family_friendly_search(webpage), + 'thumbnail': url_or_none(options.get('poster')), + 'duration': int_or_none(player.get('duration')), 'formats': formats, - } + }) From ffa7b2bfee7b94191ffc20ef00c22f708c97cddf Mon Sep 17 00:00:00 2001 From: 
gfabiano Date: Mon, 30 Jul 2018 18:15:20 +0200 Subject: [PATCH 032/159] [cbnc] Add support for new URL schema (closes #14193) --- youtube_dl/extractor/cnbc.py | 41 +++++++++++++++++++++++++++++- youtube_dl/extractor/extractors.py | 5 +++- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py index d354d9f95..35c0b6124 100644 --- a/youtube_dl/extractor/cnbc.py +++ b/youtube_dl/extractor/cnbc.py @@ -1,8 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals + from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + js_to_json, + smuggle_url, +) class CNBCIE(InfoExtractor): @@ -34,3 +38,38 @@ class CNBCIE(InfoExtractor): {'force_smil_url': True}), 'id': video_id, } + + +class CNBCNewIE(InfoExtractor): + IE_NAME = 'CNBC:new' + _VALID_URL = r'https?://(?:www)?\.cnbc\.com/video.*/(?P[^.]+)' + _TEST = { + 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', + 'info_dict': { + 'id': '7000031301', + 'ext': 'mp4', + 'title': 'Trump: I don\'t necessarily agree with raising rates', + 'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3', + 'timestamp': 1531958400, + 'upload_date': '20180719', + 'uploader': 'NBCU-CNBC', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + CNBC_URL_TEMPLATE = 'http://video.cnbc.com/gallery/?video=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._parse_json( + self._search_regex( + r'(?s).*]*>.*?({.+?content_id.+?}).*?', + webpage, display_id), + display_id, transform_source=js_to_json + )['content_id'] + + return self.url_result(self.CNBC_URL_TEMPLATE % video_id, 'CNBC') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f013d13c3..93574907b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ 
-209,7 +209,10 @@ from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE from .cmt import CMTIE -from .cnbc import CNBCIE +from .cnbc import ( + CNBCIE, + CNBCNewIE, +) from .cnn import ( CNNIE, CNNBlogsIE, From 94db1f7f3b7269d5843b815ef2aa5b71d0361e6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Oct 2018 23:53:39 +0700 Subject: [PATCH 033/159] [cnbc] Simplify extraction (closes #14280, closes #17110) --- youtube_dl/extractor/cnbc.py | 29 ++++++++++------------------- youtube_dl/extractor/extractors.py | 2 +- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py index 35c0b6124..81b0c9fc4 100644 --- a/youtube_dl/extractor/cnbc.py +++ b/youtube_dl/extractor/cnbc.py @@ -3,10 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - js_to_json, - smuggle_url, -) +from ..utils import smuggle_url class CNBCIE(InfoExtractor): @@ -40,36 +37,30 @@ class CNBCIE(InfoExtractor): } -class CNBCNewIE(InfoExtractor): - IE_NAME = 'CNBC:new' - _VALID_URL = r'https?://(?:www)?\.cnbc\.com/video.*/(?P[^.]+)' +class CNBCVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www)?\.cnbc\.com/video/(?:[^/]+/)+(?P[^./?#&]+)' _TEST = { 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', 'info_dict': { 'id': '7000031301', 'ext': 'mp4', - 'title': 'Trump: I don\'t necessarily agree with raising rates', + 'title': "Trump: I don't necessarily agree with raising rates", 'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3', 'timestamp': 1531958400, 'upload_date': '20180719', 'uploader': 'NBCU-CNBC', }, 'params': { - # m3u8 download 'skip_download': True, }, } - CNBC_URL_TEMPLATE = 'http://video.cnbc.com/gallery/?video=%s' - def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._parse_json( - 
self._search_regex( - r'(?s).*]*>.*?({.+?content_id.+?}).*?', - webpage, display_id), - display_id, transform_source=js_to_json - )['content_id'] - - return self.url_result(self.CNBC_URL_TEMPLATE % video_id, 'CNBC') + video_id = self._search_regex( + r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, + 'video id') + return self.url_result( + 'http://video.cnbc.com/gallery/?video=%s' % video_id, + CNBCIE.ie_key()) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 93574907b..d96e23905 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -211,7 +211,7 @@ from .clyp import ClypIE from .cmt import CMTIE from .cnbc import ( CNBCIE, - CNBCNewIE, + CNBCVideoIE, ) from .cnn import ( CNNIE, From 9aac22c195cf41ff46a644bd027481629d0e6d06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 30 Oct 2018 00:22:18 +0700 Subject: [PATCH 034/159] [theplatform] Improve error detection (#13222) --- youtube_dl/extractor/theplatform.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index ffef5bf06..181620615 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -39,9 +39,17 @@ class ThePlatformBaseIE(OnceIE): smil_url, video_id, note=note, query={'format': 'SMIL'}, headers=self.geo_verification_headers()) error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') - if error_element is not None and error_element.attrib['src'].startswith( - 'http://link.theplatform.%s/s/errorFiles/Unavailable.' 
% self._TP_TLD): - raise ExtractorError(error_element.attrib['abstract'], expected=True) + if error_element is not None: + exception = find_xpath_attr( + error_element, _x('.//smil:param'), 'name', 'exception') + if exception is not None: + if exception.get('value') == 'GeoLocationBlocked': + self.raise_geo_restricted(error_element.attrib['abstract']) + elif error_element.attrib['src'].startswith( + 'http://link.theplatform.%s/s/errorFiles/Unavailable.' + % self._TP_TLD): + raise ExtractorError( + error_element.attrib['abstract'], expected=True) smil_formats = self._parse_smil_formats( meta, smil_url, video_id, namespace=default_ns, From aa7e974a2a61a20e017b52a3a9ab1fb43cf8cd13 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 29 Oct 2018 19:28:09 +0100 Subject: [PATCH 035/159] [linkedin:learning] Add new extractor(closes #13545) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/linkedin.py | 175 +++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 youtube_dl/extractor/linkedin.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d96e23905..8879f5d90 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -572,6 +572,10 @@ from .limelight import ( LimelightChannelListIE, ) from .line import LineTVIE +from .linkedin import ( + LinkedInLearningIE, + LinkedInLearningCourseIE, +) from .litv import LiTVIE from .liveleak import ( LiveLeakIE, diff --git a/youtube_dl/extractor/linkedin.py b/youtube_dl/extractor/linkedin.py new file mode 100644 index 000000000..6333a8fd3 --- /dev/null +++ b/youtube_dl/extractor/linkedin.py @@ -0,0 +1,175 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + urlencode_postdata, +) + + +class LinkedInLearningBaseIE(InfoExtractor): + _NETRC_MACHINE = 'linkedin' + + def _call_api(self, 
course_slug, fields, video_slug=None, resolution=None): + query = { + 'courseSlug': course_slug, + 'fields': fields, + 'q': 'slugs', + } + sub = '' + if video_slug: + query.update({ + 'videoSlug': video_slug, + 'resolution': '_%s' % resolution, + }) + sub = ' %dp' % resolution + api_url = 'https://www.linkedin.com/learning-api/detailedCourses' + return self._download_json( + api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={ + 'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value, + }, query=query)['elements'][0] + + def _get_video_id(self, urn, course_slug, video_slug): + if urn: + mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn) + if mobj: + return mobj.group(1) + return '%s/%s' % (course_slug, video_slug) + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + + login_page = self._download_webpage( + 'https://www.linkedin.com/uas/login?trk=learning', + None, 'Downloading login page') + action_url = self._search_regex( + r']+action=(["\'])(?P.+?)\1', login_page, 'post url', + default='https://www.linkedin.com/uas/login-submit', group='url') + data = self._hidden_inputs(login_page) + data.update({ + 'session_key': email, + 'session_password': password, + }) + login_submit_page = self._download_webpage( + action_url, None, 'Logging in', + data=urlencode_postdata(data)) + error = self._search_regex( + r']+class="error"[^>]*>\s*(.+?)\s*', + login_submit_page, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + +class LinkedInLearningIE(LinkedInLearningBaseIE): + IE_NAME = 'linkedin:learning' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P[^/]+)/(?P[^/?#]+)' + _TEST = { + 'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true', + 'md5': 'a1d74422ff0d5e66a792deb996693167', + 'info_dict': { + 'id': '90426', + 'ext': 'mp4', + 'title': 'Welcome', + 'timestamp': 1430396150.82, + 'upload_date': 
'20150430', + }, + } + + def _real_extract(self, url): + course_slug, video_slug = re.match(self._VALID_URL, url).groups() + + video_data = None + formats = [] + for width, height in ((640, 360), (960, 540), (1280, 720)): + video_data = self._call_api( + course_slug, 'selectedVideo', video_slug, height)['selectedVideo'] + + video_url_data = video_data.get('url') or {} + progressive_url = video_url_data.get('progressiveUrl') + if progressive_url: + formats.append({ + 'format_id': 'progressive-%dp' % height, + 'url': progressive_url, + 'height': height, + 'width': width, + 'source_preference': 1, + }) + + title = video_data['title'] + + audio_url = video_data.get('audio', {}).get('progressiveUrl') + if audio_url: + formats.append({ + 'abr': 64, + 'ext': 'm4a', + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + + streaming_url = video_url_data.get('streamingUrl') + if streaming_url: + formats.extend(self._extract_m3u8_formats( + streaming_url, video_slug, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr')) + + return { + 'id': self._get_video_id(video_data.get('urn'), course_slug, video_slug), + 'title': title, + 'formats': formats, + 'thumbnail': video_data.get('defaultThumbnail'), + 'timestamp': float_or_none(video_data.get('publishedOn'), 1000), + 'duration': int_or_none(video_data.get('durationInSeconds')), + } + + +class LinkedInLearningCourseIE(LinkedInLearningBaseIE): + IE_NAME = 'linkedin:learning:course' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P[^/?#]+)' + _TEST = { + 'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals', + 'info_dict': { + 'id': 'programming-foundations-fundamentals', + 'title': 'Programming Foundations: Fundamentals', + 'description': 'md5:76e580b017694eb89dc8e8923fff5c86', + }, + 'playlist_mincount': 61, + } + + @classmethod + def suitable(cls, url): + return False if 
LinkedInLearningIE.suitable(url) else super(LinkedInLearningCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_slug = self._match_id(url) + course_data = self._call_api(course_slug, 'chapters,description,title') + + entries = [] + for chapter in course_data.get('chapters', []): + chapter_title = chapter.get('title') + for video in chapter.get('videos', []): + video_slug = video.get('slug') + if not video_slug: + continue + entries.append({ + '_type': 'url', + 'id': self._get_video_id(video.get('urn'), course_slug, video_slug), + 'title': video.get('title'), + 'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug), + 'chapter': chapter_title, + 'ie_key': LinkedInLearningIE.ie_key(), + }) + + return self.playlist_result( + entries, course_slug, + course_data.get('title'), + course_data.get('description')) From b14475724b78bf3b4f2f448bb2953dfd52d3d425 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 29 Oct 2018 21:49:12 +0100 Subject: [PATCH 036/159] [linkedin:learning:course] use url_transparent type for playlist entries --- youtube_dl/extractor/linkedin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/linkedin.py b/youtube_dl/extractor/linkedin.py index 6333a8fd3..259fc4c5e 100644 --- a/youtube_dl/extractor/linkedin.py +++ b/youtube_dl/extractor/linkedin.py @@ -161,7 +161,7 @@ class LinkedInLearningCourseIE(LinkedInLearningBaseIE): if not video_slug: continue entries.append({ - '_type': 'url', + '_type': 'url_transparent', 'id': self._get_video_id(video.get('urn'), course_slug, video_slug), 'title': video.get('title'), 'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug), From f16679e8436fb0e9d01aca2343ce22a01802667f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 30 Oct 2018 04:57:28 +0700 Subject: [PATCH 037/159] [cnbc:video] Fix _VALID_URL (#17110) --- youtube_dl/extractor/cnbc.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py index 81b0c9fc4..6889b0f40 100644 --- a/youtube_dl/extractor/cnbc.py +++ b/youtube_dl/extractor/cnbc.py @@ -38,7 +38,7 @@ class CNBCIE(InfoExtractor): class CNBCVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www)?\.cnbc\.com/video/(?:[^/]+/)+(?P[^./?#&]+)' + _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P[^./?#&]+)' _TEST = { 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', 'info_dict': { From c70ba664f19f0323d74e4e8ea76249f4c97def06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Nov 2018 01:35:32 +0700 Subject: [PATCH 038/159] [njpwworld] Fix authentication (closes #17427) --- youtube_dl/extractor/njpwworld.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index febef097a..025c5d249 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -31,6 +31,8 @@ class NJPWWorldIE(InfoExtractor): 'skip': 'Requires login', } + _LOGIN_URL = 'https://front.njpwworld.com/auth/login' + def _real_initialize(self): self._login() @@ -40,13 +42,17 @@ class NJPWWorldIE(InfoExtractor): if not username: return True + # Setup session (will set necessary cookies) + self._request_webpage( + 'https://njpwworld.com/', None, note='Setting up session') + webpage, urlh = self._download_webpage_handle( - 'https://njpwworld.com/auth/login', None, + self._LOGIN_URL, None, note='Logging in', errnote='Unable to login', data=urlencode_postdata({'login_id': username, 'pw': password}), - headers={'Referer': 'https://njpwworld.com/auth'}) + headers={'Referer': 'https://front.njpwworld.com/auth'}) # /auth/login will return 302 for successful logins - if urlh.geturl() == 'https://njpwworld.com/auth/login': + if urlh.geturl() == self._LOGIN_URL: self.report_warning('unable to login') 
return False From 061ea3a776830f15dbce899bad8b48232f32aaf0 Mon Sep 17 00:00:00 2001 From: yonaikerlol Date: Fri, 2 Nov 2018 12:08:41 -0400 Subject: [PATCH 039/159] [openload] Add support for oload.fun --- youtube_dl/extractor/openload.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index a91f29f5c..2473536fd 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -319,7 +319,10 @@ class OpenloadIE(InfoExtractor): 'only_matching': True, }, { 'url': 'https://oload.icu/f/-_i4y_F_Hs8', - 'only_matching': True + 'only_matching': True, + }, { + 'url': 'https://oload.fun/f/gb6G1H4sHXY', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From c620694c97151396055108cd10d2a393036eb334 Mon Sep 17 00:00:00 2001 From: Sebastian Haas Date: Tue, 30 Oct 2018 23:44:50 +0100 Subject: [PATCH 040/159] [orf:tvthek] Fix extraction (closes #17737) use _extract_m3u8_formats and _extract_f4m_formats helper functions closes #17737 --- youtube_dl/extractor/orf.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index c1fb580ca..da8031ad2 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -80,14 +80,16 @@ class ORFTVthekIE(InfoExtractor): if not video_id or not title: continue video_id = 
compat_str(video_id) - formats = [{ - 'preference': -10 if fd['delivery'] == 'hls' else None, - 'format_id': '%s-%s-%s' % ( - fd['delivery'], fd['quality'], fd['quality_string']), - 'url': fd['src'], - 'protocol': fd['protocol'], - 'quality': quality_to_int(fd['quality']), - } for fd in sd['sources']] + formats = [] + for fd in sd['sources']: + format_id = '%s-%s-%s' % ( + fd['delivery'], fd['quality'], fd['quality_string']) + if determine_ext(fd['src']) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + fd['src'], video_id, 'mp4', m3u8_id=format_id)) + elif determine_ext(fd['src']) == 'f4m': + formats.extend(self._extract_f4m_formats( + fd['src'], video_id, f4m_id=format_id)) # Check for geoblocking. # There is a property is_geoprotection, but that's always false From 4b6aca17cc7d4df22e78501b4c00a9281c189ab3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 2 Nov 2018 23:46:56 +0700 Subject: [PATCH 041/159] [orf:tvthek] Improve extraction and remove unused code (closes #17956, closes #18024) --- youtube_dl/extractor/orf.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index da8031ad2..d432e3449 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -15,6 +15,7 @@ from ..utils import ( strip_jsonp, unescapeHTML, unified_strdate, + url_or_none, ) @@ -68,12 +69,6 @@ class ORFTVthekIE(InfoExtractor): webpage, 'playlist', group='json'), playlist_id, transform_source=unescapeHTML)['playlist']['videos'] - def quality_to_int(s): - m = re.search('([0-9]+)', s) - if m is None: - return -1 - return int(m.group(1)) - entries = [] for sd in data_jsb: video_id, title = sd.get('id'), sd.get('title') @@ -82,14 +77,27 @@ class ORFTVthekIE(InfoExtractor): video_id = compat_str(video_id) formats = [] for fd in sd['sources']: - format_id = '%s-%s-%s' % ( - fd['delivery'], fd['quality'], fd['quality_string']) + src = 
url_or_none(fd.get('src')) + if not src: + continue + format_id_list = [] + for key in ('delivery', 'quality', 'quality_string'): + value = fd.get(key) + if value: + format_id_list.append(value) + format_id = '-'.join(format_id_list) if determine_ext(fd['src']) == 'm3u8': formats.extend(self._extract_m3u8_formats( fd['src'], video_id, 'mp4', m3u8_id=format_id)) elif determine_ext(fd['src']) == 'f4m': formats.extend(self._extract_f4m_formats( fd['src'], video_id, f4m_id=format_id)) + else: + formats.append({ + 'format_id': format_id, + 'url': src, + 'protocol': fd.get('protocol'), + }) # Check for geoblocking. # There is a property is_geoprotection, but that's always false From 036f905161b104ef90e75ad42d472b45eeb102ba Mon Sep 17 00:00:00 2001 From: sichuan-pepper Date: Sat, 27 Oct 2018 03:40:44 +0900 Subject: [PATCH 042/159] [twitcasting] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/twitcasting.py | 44 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 youtube_dl/extractor/twitcasting.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8879f5d90..27452d73e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1196,6 +1196,7 @@ from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE from .twentythreevideo import TwentyThreeVideoIE +from .twitcasting import TwitcastingIE from .twitch import ( TwitchVideoIE, TwitchChapterIE, diff --git a/youtube_dl/extractor/twitcasting.py b/youtube_dl/extractor/twitcasting.py new file mode 100644 index 000000000..856df5c0b --- /dev/null +++ b/youtube_dl/extractor/twitcasting.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +import re + + +class TwitcastingIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:(?:www|ssl|en|pt|es|ja|ko)\.)?twitcasting\.tv/(?P[^\/]+)/movie/(?P[0-9]+)' + _TEST = { + 'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609', + 'md5': '745243cad58c4681dc752490f7540d7f', + 'info_dict': { + 'id': '2357609', + 'ext': 'mp4', + 'title': 'Recorded Live #2357609', + 'uploader_id': 'ivetesangalo', + 'description': "Moi! I'm live on TwitCasting from my iPhone.", + 'thumbnail': r're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + uploader_id = mobj.group('uploader_id') + + webpage = self._download_webpage(url, video_id) + + playlist_url = self._html_search_regex(r'(["\'])(?Phttp.+?\.m3u8.*?)\1', webpage, name='playlist url', group='url') + formats = self._extract_m3u8_formats(playlist_url, video_id, ext='mp4') + thumbnail = self._og_search_thumbnail(webpage) + title = self._html_search_meta('twitter:title', webpage) + description = self._og_search_description(webpage) or self._html_search_meta('twitter:description', webpage) + return{ + 'id': video_id, + 'url': url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader_id': uploader_id, + 'formats': formats, + } From cf0db4d99785532d767d0ca1cc029c73d16bb045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 3 Nov 2018 00:27:36 +0700 Subject: [PATCH 043/159] [twitcasting] Improve extraction and fix issues (closes #17981) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/twitcasting.py | 36 +++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 27452d73e..b41cd65d7 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1196,7 +1196,7 @@ from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE from 
.twentythreevideo import TwentyThreeVideoIE -from .twitcasting import TwitcastingIE +from .twitcasting import TwitCastingIE from .twitch import ( TwitchVideoIE, TwitchChapterIE, diff --git a/youtube_dl/extractor/twitcasting.py b/youtube_dl/extractor/twitcasting.py index 856df5c0b..05f8aa9ce 100644 --- a/youtube_dl/extractor/twitcasting.py +++ b/youtube_dl/extractor/twitcasting.py @@ -6,8 +6,8 @@ from .common import InfoExtractor import re -class TwitcastingIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|ssl|en|pt|es|ja|ko)\.)?twitcasting\.tv/(?P[^\/]+)/movie/(?P[0-9]+)' +class TwitCastingIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P[^/]+)/movie/(?P\d+)' _TEST = { 'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609', 'md5': '745243cad58c4681dc752490f7540d7f', @@ -18,24 +18,40 @@ class TwitcastingIE(InfoExtractor): 'uploader_id': 'ivetesangalo', 'description': "Moi! I'm live on TwitCasting from my iPhone.", 'thumbnail': r're:^https?://.*\.jpg$', - } + }, + 'params': { + 'skip_download': True, + }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') + video_id = mobj.group('id') uploader_id = mobj.group('uploader_id') webpage = self._download_webpage(url, video_id) - playlist_url = self._html_search_regex(r'(["\'])(?Phttp.+?\.m3u8.*?)\1', webpage, name='playlist url', group='url') - formats = self._extract_m3u8_formats(playlist_url, video_id, ext='mp4') + title = self._html_search_regex( + r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)(?:(?!\1).)+)\1', + r'(["\'])(?Phttp.+?\.m3u8.*?)\1'), + webpage, 'm3u8 url', group='url') + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + thumbnail = self._og_search_thumbnail(webpage) - title = self._html_search_meta('twitter:title', webpage) - description = self._og_search_description(webpage) or self._html_search_meta('twitter:description', webpage) - return{ + 
description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage) + + return { 'id': video_id, - 'url': url, 'title': title, 'description': description, 'thumbnail': thumbnail, From 95e42d7336d01f505d6551a21df52f3ae234e96b Mon Sep 17 00:00:00 2001 From: Xiao Di Guan Date: Sat, 3 Nov 2018 05:18:20 +1100 Subject: [PATCH 044/159] [extractor/common] Ensure response handle is not prematurely closed before it can be read if it matches expected_status (resolves #17195, closes #17846, resolves #17447) --- test/helper.py | 10 ++++++++ test/test_InfoExtractor.py | 42 ++++++++++++++++++++++++++++++++-- test/test_downloader_http.py | 12 +--------- test/test_http.py | 10 +------- youtube_dl/extractor/common.py | 5 ++++ 5 files changed, 57 insertions(+), 22 deletions(-) diff --git a/test/helper.py b/test/helper.py index dfee217a9..aa9a1c9b2 100644 --- a/test/helper.py +++ b/test/helper.py @@ -7,6 +7,7 @@ import json import os.path import re import types +import ssl import sys import youtube_dl.extractor @@ -244,3 +245,12 @@ def expect_warnings(ydl, warnings_re): real_warning(w) ydl.report_warning = _report_warning + + +def http_server_port(httpd): + if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): + # In Jython SSLSocket is not a subclass of socket.socket + sock = httpd.socket.sock + else: + sock = httpd.socket + return sock.getsockname()[1] diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 4833396a5..06be72616 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -9,11 +9,30 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, expect_dict, expect_value -from youtube_dl.compat import compat_etree_fromstring +from test.helper import FakeYDL, expect_dict, expect_value, http_server_port +from youtube_dl.compat import compat_etree_fromstring, compat_http_server from 
youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError +import threading + + +TEAPOT_RESPONSE_STATUS = 418 +TEAPOT_RESPONSE_BODY = "

418 I'm a teapot

" + + +class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + def log_message(self, format, *args): + pass + + def do_GET(self): + if self.path == '/teapot': + self.send_response(TEAPOT_RESPONSE_STATUS) + self.send_header('Content-Type', 'text/html; charset=utf-8') + self.end_headers() + self.wfile.write(TEAPOT_RESPONSE_BODY.encode()) + else: + assert False class TestIE(InfoExtractor): @@ -743,6 +762,25 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ for i in range(len(entries)): expect_dict(self, entries[i], expected_entries[i]) + def test_response_with_expected_status_returns_content(self): + # Checks for mitigations against the effects of + # that affect Python 3.4.1+, which + # manifest as `_download_webpage`, `_download_xml`, `_download_json`, + # or the underlying `_download_webpage_handle` returning no content + # when a response matches `expected_status`. + + httpd = compat_http_server.HTTPServer( + ('127.0.0.1', 0), InfoExtractorTestRequestHandler) + port = http_server_port(httpd) + server_thread = threading.Thread(target=httpd.serve_forever) + server_thread.daemon = True + server_thread.start() + + (content, urlh) = self.ie._download_webpage_handle( + 'http://127.0.0.1:%d/teapot' % port, None, + expected_status=TEAPOT_RESPONSE_STATUS) + self.assertEqual(content, TEAPOT_RESPONSE_BODY) + if __name__ == '__main__': unittest.main() diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py index 5cf2bf1a5..750472281 100644 --- a/test/test_downloader_http.py +++ b/test/test_downloader_http.py @@ -9,26 +9,16 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import try_rm +from test.helper import http_server_port, try_rm from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server from youtube_dl.downloader.http import HttpFD from youtube_dl.utils import encodeFilename -import ssl import 
threading TEST_DIR = os.path.dirname(os.path.abspath(__file__)) -def http_server_port(httpd): - if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): - # In Jython SSLSocket is not a subclass of socket.socket - sock = httpd.socket.sock - else: - sock = httpd.socket - return sock.getsockname()[1] - - TEST_SIZE = 10 * 1024 diff --git a/test/test_http.py b/test/test_http.py index 409fec9c8..3ee0a5dda 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,6 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from test.helper import http_server_port from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server, compat_urllib_request import ssl @@ -16,15 +17,6 @@ import threading TEST_DIR = os.path.dirname(os.path.abspath(__file__)) -def http_server_port(httpd): - if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): - # In Jython SSLSocket is not a subclass of socket.socket - sock = httpd.socket.sock - else: - sock = httpd.socket - return sock.getsockname()[1] - - class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): def log_message(self, format, *args): pass diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8452125c8..e5f8136fc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -606,6 +606,11 @@ class InfoExtractor(object): except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if isinstance(err, compat_urllib_error.HTTPError): if self.__can_accept_status_code(err, expected_status): + # Retain reference to error to prevent file object from + # being closed before it can be read. Works around the + # effects of + # introduced in Python 3.4.1. 
+ err.fp._error = err return err.fp if errnote is False: From da56fb631fb6389849f2f021bdd047aa1c55dc0a Mon Sep 17 00:00:00 2001 From: Alexander Seiler Date: Tue, 2 Oct 2018 14:49:01 +0200 Subject: [PATCH 045/159] [azmedien] Adopt to major site redesign (closes #17745) --- youtube_dl/extractor/azmedien.py | 222 +++++++---------------------- youtube_dl/extractor/extractors.py | 6 +- 2 files changed, 53 insertions(+), 175 deletions(-) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index 68f26e2ca..9d606ee67 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -1,19 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import ( - get_element_by_class, - get_element_by_id, - strip_or_none, - urljoin, -) class AZMedienBaseIE(InfoExtractor): + _PARTNER_ID = '1719221' + def _kaltura_video(self, partner_id, entry_id): return self.url_result( 'kaltura:%s:%s' % (partner_id, entry_id), ie=KalturaIE.ie_key(), @@ -25,189 +22,74 @@ class AZMedienIE(AZMedienBaseIE): _VALID_URL = r'''(?x) https?:// (?:www\.)? - (?: + (?P telezueri\.ch| telebaern\.tv| telem1\.ch )/ - [0-9]+-show-[^/\#]+ - (?: - /[0-9]+-episode-[^/\#]+ - (?: - /[0-9]+-segment-(?:[^/\#]+\#)?| - \# - )| - \# + [^/]+/ + (?P + [^/]+-(?P\d+) ) - (?P[^\#]+) + (?: + \#video= + (?P + [_0-9a-z]+ + ) + )? 
''' _TESTS = [{ - # URL with 'segment' - 'url': 'http://www.telezueri.ch/62-show-zuerinews/13772-episode-sonntag-18-dezember-2016/32419-segment-massenabweisungen-beim-hiltl-club-wegen-pelzboom', + 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', 'info_dict': { - 'id': '1_2444peh4', + 'id': '1_anruz3wy', 'ext': 'mp4', - 'title': 'Massenabweisungen beim Hiltl Club wegen Pelzboom', - 'description': 'md5:9ea9dd1b159ad65b36ddcf7f0d7c76a8', - 'uploader_id': 'TeleZ?ri', - 'upload_date': '20161218', - 'timestamp': 1482084490, + 'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen', + 'description': 'md5:dd9f96751ec9c35e409a698a328402f3', + 'uploader_id': 'TVOnline', + 'upload_date': '20180930', + 'timestamp': 1538328802, }, 'params': { 'skip_download': True, }, }, { - # URL with 'segment' and fragment: - 'url': 'http://www.telebaern.tv/118-show-news/14240-episode-dienstag-17-januar-2017/33666-segment-achtung-gefahr#zu-wenig-pflegerinnen-und-pfleger', - 'only_matching': True - }, { - # URL with 'episode' and fragment: - 'url': 'http://www.telem1.ch/47-show-sonntalk/13986-episode-soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz#soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz', - 'only_matching': True - }, { - # URL with 'show' and fragment: - 'url': 'http://www.telezueri.ch/66-show-sonntalk#burka-plakate-trump-putin-china-besuch', + 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] def _real_extract(self, url): video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + entry_id = mobj.group('kaltura_id') - webpage = self._download_webpage(url, video_id) + if not entry_id: + webpage = self._download_webpage(url, video_id) + api_path = self._search_regex( + r'["\']apiPath["\']\s*:\s*["\']([^"^\']+)["\']', + webpage, 'api path') + api_url = 'https://www.%s%s' % (mobj.group('host'), api_path) + payload = { + 'query': 
'''query VideoContext($articleId: ID!) { + article: node(id: $articleId) { + ... on Article { + mainAssetRelation { + asset { + ... on VideoAsset { + kalturaId + } + } + } + } + } + }''', + 'variables': {'articleId': 'Article:%s' % mobj.group('article_id')}, + } + json_data = self._download_json( + api_url, video_id, headers={ + 'Content-Type': 'application/json', + }, + data=json.dumps(payload).encode()) + entry_id = json_data['data']['article']['mainAssetRelation']['asset']['kalturaId'] - partner_id = self._search_regex( - r']+src=["\'](?:https?:)?//(?:[^/]+\.)?kaltura\.com(?:/[^/]+)*/(?:p|partner_id)/([0-9]+)', - webpage, 'kaltura partner id') - entry_id = self._html_search_regex( - r']+data-id=(["\'])(?P(?:(?!\1).)+)\1[^>]+data-slug=["\']%s' - % re.escape(video_id), webpage, 'kaltura entry id', group='id') - - return self._kaltura_video(partner_id, entry_id) - - -class AZMedienPlaylistIE(AZMedienBaseIE): - IE_DESC = 'AZ Medien playlists' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?: - telezueri\.ch| - telebaern\.tv| - telem1\.ch - )/ - (?P[0-9]+- - (?: - show| - topic| - themen - )-[^/\#]+ - (?: - /[0-9]+-episode-[^/\#]+ - )? - )$ - ''' - - _TESTS = [{ - # URL with 'episode' - 'url': 'http://www.telebaern.tv/118-show-news/13735-episode-donnerstag-15-dezember-2016', - 'info_dict': { - 'id': '118-show-news/13735-episode-donnerstag-15-dezember-2016', - 'title': 'News - Donnerstag, 15. 
Dezember 2016', - }, - 'playlist_count': 9, - }, { - # URL with 'themen' - 'url': 'http://www.telem1.ch/258-themen-tele-m1-classics', - 'info_dict': { - 'id': '258-themen-tele-m1-classics', - 'title': 'Tele M1 Classics', - }, - 'playlist_mincount': 15, - }, { - # URL with 'topic', contains nested playlists - 'url': 'http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen', - 'only_matching': True, - }, { - # URL with 'show' only - 'url': 'http://www.telezueri.ch/86-show-talktaeglich', - 'only_matching': True - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id) - - entries = [] - - partner_id = self._search_regex( - r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', - webpage, 'kaltura partner id', default=None) - - if partner_id: - entries = [ - self._kaltura_video(partner_id, m.group('id')) - for m in re.finditer( - r'data-id=(["\'])(?P(?:(?!\1).)+)\1', webpage)] - - if not entries: - entries = [ - self.url_result(m.group('url'), ie=AZMedienIE.ie_key()) - for m in re.finditer( - r']+data-real=(["\'])(?Phttp.+?)\1', webpage)] - - if not entries: - entries = [ - # May contain nested playlists (e.g. [1]) thus no explicit - # ie_key - # 1. http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen) - self.url_result(urljoin(url, m.group('url'))) - for m in re.finditer( - r']+name=[^>]+href=(["\'])(?P/.+?)\1', webpage)] - - title = self._search_regex( - r'episodeShareTitle\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'title', - default=strip_or_none(get_element_by_id( - 'video-title', webpage)), group='title') - - return self.playlist_result(entries, show_id, title) - - -class AZMedienShowPlaylistIE(AZMedienBaseIE): - IE_DESC = 'AZ Medien show playlists' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? 
- (?: - telezueri\.ch| - telebaern\.tv| - telem1\.ch - )/ - (?: - all-episodes| - alle-episoden - )/ - (?P<id>[^/?#&]+) - ''' - - _TEST = { - 'url': 'http://www.telezueri.ch/all-episodes/astrotalk', - 'info_dict': { - 'id': 'astrotalk', - 'title': 'TeleZüri: AstroTalk - alle episoden', - 'description': 'md5:4c0f7e7d741d906004266e295ceb4a26', - }, - 'playlist_mincount': 13, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - episodes = get_element_by_class('search-mobile-box', webpage) - entries = [self.url_result( - urljoin(url, m.group('url'))) for m in re.finditer( - r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', episodes)] - title = self._og_search_title(webpage, fatal=False) - description = self._og_search_description(webpage) - return self.playlist_result(entries, playlist_id, title, description) + return self._kaltura_video(self._PARTNER_ID, entry_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b41cd65d7..9b68c9efe 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -88,11 +88,7 @@ from .awaan import ( AWAANLiveIE, AWAANSeasonIE, ) -from .azmedien import ( - AZMedienIE, - AZMedienPlaylistIE, - AZMedienShowPlaylistIE, -) +from .azmedien import AZMedienIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE From 573531dcfb6869afa143761faf4ebc6ab405f808 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 3 Nov 2018 01:32:29 +0700 Subject: [PATCH 046/159] [azmedien] Simplify (closes #17746) --- youtube_dl/extractor/azmedien.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index 9d606ee67..a57a5f114 100644 --- a/youtube_dl/extractor/azmedien.py +++ 
b/youtube_dl/extractor/azmedien.py @@ -8,16 +8,7 @@ from .common import InfoExtractor from .kaltura import KalturaIE -class AZMedienBaseIE(InfoExtractor): - _PARTNER_ID = '1719221' - - def _kaltura_video(self, partner_id, entry_id): - return self.url_result( - 'kaltura:%s:%s' % (partner_id, entry_id), ie=KalturaIE.ie_key(), - video_id=entry_id) - - -class AZMedienIE(AZMedienBaseIE): +class AZMedienIE(InfoExtractor): IE_DESC = 'AZ Medien videos' _VALID_URL = r'''(?x) https?:// @@ -58,9 +49,11 @@ class AZMedienIE(AZMedienBaseIE): 'only_matching': True }] + _PARTNER_ID = '1719221' + def _real_extract(self, url): - video_id = self._match_id(url) mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') entry_id = mobj.group('kaltura_id') if not entry_id: @@ -92,4 +85,6 @@ class AZMedienIE(AZMedienBaseIE): data=json.dumps(payload).encode()) entry_id = json_data['data']['article']['mainAssetRelation']['asset']['kalturaId'] - return self._kaltura_video(self._PARTNER_ID, entry_id) + return self.url_result( + 'kaltura:%s:%s' % (self._PARTNER_ID, entry_id), + ie=KalturaIE.ie_key(), video_id=entry_id) From faac1c1f70425ddd60ff39d3b6a2b34c7941463b Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Tue, 1 May 2018 05:36:03 +0200 Subject: [PATCH 047/159] [ehftv] Add extractor (closes #15408) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/laola1tv.py | 115 ++++++++++++++++++----------- 2 files changed, 73 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9b68c9efe..e5488cce4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -539,6 +539,7 @@ from .la7 import LA7IE from .laola1tv import ( Laola1TvEmbedIE, Laola1TvIE, + EHFTVIE, ITTFIE, ) from .lci import LCIIE diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index c7f813370..d985bd3ca 100644 --- a/youtube_dl/extractor/laola1tv.py +++ 
b/youtube_dl/extractor/laola1tv.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor from ..utils import ( @@ -119,9 +120,59 @@ class Laola1TvEmbedIE(InfoExtractor): } -class Laola1TvIE(Laola1TvEmbedIE): +class Laola1TvBaseIE(Laola1TvEmbedIE): + def _extract_video(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + if 'Dieser Livestream ist bereits beendet.' in webpage: + raise ExtractorError('This live stream has already finished.', expected=True) + + conf = self._parse_json(self._search_regex( + r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), + display_id, + transform_source=lambda s: js_to_json(re.sub(r'shareurl:.+,', '', s))) + video_id = conf['videoid'] + + config = self._download_json(conf['configUrl'], video_id, query={ + 'videoid': video_id, + 'partnerid': conf['partnerid'], + 'language': conf.get('language', ''), + 'portal': conf.get('portalid', ''), + }) + error = config.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + video_data = config['video'] + title = video_data['title'] + is_live = video_data.get('isLivestream') and video_data.get('isLive') + meta = video_data.get('metaInformation') + sports = meta.get('sports') + categories = sports.split(',') if sports else [] + + token_url = self._extract_token_url( + video_data['streamAccess'], video_id, + video_data['abo']['required']) + + formats = self._extract_formats(token_url, video_id) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': self._live_title(title) if is_live else title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('image'), + 'categories': categories, + 'formats': formats, + 'is_live': is_live, + } + + +class Laola1TvIE(Laola1TvBaseIE): IE_NAME = 'laola1tv' _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)' + _TESTS = [{ 'url': 
'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { @@ -169,52 +220,30 @@ class Laola1TvIE(Laola1TvEmbedIE): }] def _real_extract(self, url): - display_id = self._match_id(url) + return self._extract_video(url) - webpage = self._download_webpage(url, display_id) - if 'Dieser Livestream ist bereits beendet.' in webpage: - raise ExtractorError('This live stream has already finished.', expected=True) +class EHFTVIE(Laola1TvBaseIE): + IE_NAME = 'ehftv' + _VALID_URL = r'https?://(?:www\.)?ehftv\.com/[a-z]+(-[a-z]+)?/[^/]+/(?P<id>[^/?#&]+)' - conf = self._parse_json(self._search_regex( - r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), - display_id, js_to_json) + _TESTS = [{ + 'url': 'https://www.ehftv.com/int/video/paris-saint-germain-handball-pge-vive-kielce/1166761', + 'info_dict': { + 'id': '1166761', + 'display_id': 'paris-saint-germain-handball-pge-vive-kielce', + 'ext': 'mp4', + 'title': 'Paris Saint-Germain Handball - PGE Vive Kielce', + 'is_live': False, + 'categories': ['Handball'], + }, + 'params': { + 'skip_download': True, + }, + }] - video_id = conf['videoid'] - - config = self._download_json(conf['configUrl'], video_id, query={ - 'videoid': video_id, - 'partnerid': conf['partnerid'], - 'language': conf.get('language', ''), - 'portal': conf.get('portalid', ''), - }) - error = config.get('error') - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - video_data = config['video'] - title = video_data['title'] - is_live = video_data.get('isLivestream') and video_data.get('isLive') - meta = video_data.get('metaInformation') - sports = meta.get('sports') - categories = sports.split(',') if sports else [] - - token_url = self._extract_token_url( - video_data['streamAccess'], video_id, - video_data['abo']['required']) - - formats = self._extract_formats(token_url, video_id) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': self._live_title(title) if is_live else title, 
- 'description': video_data.get('description'), - 'thumbnail': video_data.get('image'), - 'categories': categories, - 'formats': formats, - 'is_live': is_live, - } + def _real_extract(self, url): + return self._extract_video(url) class ITTFIE(InfoExtractor): From 6895ea4d3f9d44048fa59800a06ab8177a24bd1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 3 Nov 2018 02:44:35 +0700 Subject: [PATCH 048/159] [laola1tv:embed] Set correct stream access URL scheme (closes #16341) --- youtube_dl/extractor/laola1tv.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index d985bd3ca..fa217365a 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -33,7 +33,8 @@ class Laola1TvEmbedIE(InfoExtractor): def _extract_token_url(self, stream_access_url, video_id, data): return self._download_json( - stream_access_url, video_id, headers={ + self._proto_relative_url(stream_access_url, 'https:'), video_id, + headers={ 'Content-Type': 'application/json', }, data=json.dumps(data).encode())['data']['stream-access'][0] @@ -225,7 +226,7 @@ class Laola1TvIE(Laola1TvBaseIE): class EHFTVIE(Laola1TvBaseIE): IE_NAME = 'ehftv' - _VALID_URL = r'https?://(?:www\.)?ehftv\.com/[a-z]+(-[a-z]+)?/[^/]+/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?ehftv\.com/[a-z]+(?:-[a-z]+)?/[^/]+/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.ehftv.com/int/video/paris-saint-germain-handball-pge-vive-kielce/1166761', From a085410936a5e0c90a8eb3059a5cd9e0703848bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 3 Nov 2018 02:56:14 +0700 Subject: [PATCH 049/159] [ChangeLog] Actualize [ci skip] --- ChangeLog | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ChangeLog b/ChangeLog index 57dbde12d..05857596a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +version <unreleased> 
+ +Core +* [extractor/common] Ensure response handle is not prematurely closed before + it can be read if it matches expected_status (#17195, #17846, #17447) + +Extractors +* [laola1tv:embed] Set correct stream access URL scheme (#16341) ++ [ehftv] Add support for ehftv.com (#15408) +* [azmedien] Adopt to major site redesign (#17745, #17746) ++ [twitcasting] Add support for twitcasting.tv (#17981) +* [orf:tvthek] Fix extraction (#17737, #17956, #18024) ++ [openload] Add support for oload.fun (#18045) +* [njpwworld] Fix authentication (#17427) ++ [linkedin:learning] Add support for linkedin.com/learning (#13545) +* [theplatform] Improve error detection (#13222) +* [cnbc] Simplify extraction (#14280, #17110) ++ [cbnc] Add support for new URL schema (#14193) +* [aparat] Improve extraction and extract more metadata (#17445, #18008) +* [aparat] Fix extraction + + version 2018.10.29 Core From 38c32dbf19d0168295d02b0afaed9227fed46338 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 3 Nov 2018 02:57:48 +0700 Subject: [PATCH 050/159] release 2018.11.03 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 7 +++++-- youtube_dl/version.py | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index aefed163a..eb8cef8ef 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.10.29*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.10.29** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.03*. 
If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.03** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.10.29 +[debug] youtube-dl version 2018.11.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 05857596a..11e1ba333 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.11.03 Core * [extractor/common] Ensure response handle is not prematurely closed before diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e5a6879bc..24c3254c3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -84,8 +84,6 @@ - **awaan:season** - **awaan:video** - **AZMedien**: AZ Medien videos - - **AZMedienPlaylist**: AZ Medien playlists - - **AZMedienShowPlaylist**: AZ Medien show playlists - **BaiduVideo**: 百度视频 - **bambuser** - **bambuser:channel** @@ -178,6 +176,7 @@ - **Clyp** - **cmt.com** - **CNBC** + - **CNBCVideo** - **CNN** - **CNNArticle** - **CNNBlogs** @@ -251,6 +250,7 @@ - **EchoMsk** - **egghead:course**: egghead.io course - **egghead:lesson**: egghead.io lesson + - **ehftv** - **eHow** - **EinsUndEinsTV** - **Einthusan** 
@@ -445,6 +445,8 @@ - **limelight:channel** - **limelight:channel_list** - **LineTV** + - **linkedin:learning** + - **linkedin:learning:course** - **LiTV** - **LiveLeak** - **LiveLeakEmbed** @@ -930,6 +932,7 @@ - **TVPlayer** - **TVPlayHome** - **Tweakers** + - **TwitCasting** - **twitch:chapter** - **twitch:clips** - **twitch:profile** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ae9a77966..90de01214 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.10.29' +__version__ = '2018.11.03' From dbdaaa231add0a8d1fa8138c448ccb344f585894 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 3 Nov 2018 06:26:16 +0700 Subject: [PATCH 051/159] [youtube] Add fallback metadata extraction from videoDetails (closes #18052) --- youtube_dl/extractor/youtube.py | 34 ++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 78203ef84..abadfa545 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -41,6 +41,7 @@ from ..utils import ( remove_quotes, remove_start, smuggle_url, + str_or_none, str_to_int, try_get, unescapeHTML, @@ -501,6 +502,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -583,6 +585,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, @@ -1538,6 +1541,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def extract_view_count(v_info): return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + player_response = {} + # Get video info embed_webpage = None if 
re.search(r'player-age-gate-content">', video_webpage) is not None: @@ -1580,6 +1585,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True sts = ytplayer_config.get('sts') + if not player_response: + pl_response = str_or_none(args.get('player_response')) + if pl_response: + pl_response = self._parse_json(pl_response, video_id, fatal=False) + if isinstance(pl_response, dict): + player_response = pl_response if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): # We also try looking in get_video_info since it may contain different dashmpd # URL that points to a DASH manifest with possibly different itag set (some itags @@ -1608,6 +1619,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not video_info_webpage: continue get_video_info = compat_parse_qs(video_info_webpage) + if not player_response: + pl_response = get_video_info.get('player_response', [None])[0] + if isinstance(pl_response, dict): + player_response = pl_response add_dash_mpd(get_video_info) if view_count is None: view_count = extract_view_count(get_video_info) @@ -1653,9 +1668,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '"token" parameter not in video info for unknown reason', video_id=video_id) + video_details = try_get( + player_response, lambda x: x['videoDetails'], dict) or {} + # title if 'title' in video_info: video_title = video_info['title'][0] + elif 'title' in player_response: + video_title = video_details['title'] else: self._downloader.report_warning('Unable to extract video title') video_title = '_' @@ -1718,6 +1738,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if view_count is None: view_count = extract_view_count(video_info) + if view_count is None and video_details: + view_count = int_or_none(video_details.get('viewCount')) # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: @@ -1898,7 +1920,9 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # uploader - video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) + video_uploader = try_get( + video_info, lambda x: x['author'][0], + compat_str) or str_or_none(video_details.get('author')) if video_uploader: video_uploader = compat_urllib_parse_unquote_plus(video_uploader) else: @@ -2011,12 +2035,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): like_count = _extract_count('like') dislike_count = _extract_count('dislike') + if view_count is None: + view_count = str_to_int(self._search_regex( + r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, + 'view count', default=None)) + # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, video_webpage) video_duration = try_get( video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = int_or_none(video_details.get('lengthSeconds')) if not video_duration: video_duration = parse_duration(self._html_search_meta( 'duration', video_webpage, 'video duration')) @@ -2244,6 +2275,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', 'categories': ['People & Blogs'], 'tags': list, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, From 22e07ce502275fbede32d212eacdaeabee22fe4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Nov 2018 00:11:36 +0700 Subject: [PATCH 052/159] [README.md] Improve documentation on safe metadata extraction and add more examples --- README.md | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fdd115c9b..35c3de512 100644 --- a/README.md +++ b/README.md @@ -1168,7 +1168,28 @@ title = self._search_regex( ### Use 
safe conversion functions -Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well. +Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. + +Use `url_or_none` for safe URL processing. + +Use `try_get` for safe metadata extraction from parsed JSON. + +Explore [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions. + +#### More examples + +##### Safely extract optional description from parsed JSON +```python +description = try_get(response, lambda x: x['result']['video'][0]['summary'], compat_str) +``` + +##### Safely extract more optional metadata +```python +video = try_get(response, lambda x: x['result']['video'][0], dict) or {} +description = video.get('summary') +duration = float_or_none(video.get('durationMs'), scale=1000) +view_count = int_or_none(video.get('views')) +``` # EMBEDDING YOUTUBE-DL From 16d896b2a74e2b9989fc0483728f8009876fc4cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Nov 2018 15:52:46 +0700 Subject: [PATCH 053/159] [zattoo] Arrange API hosts for derived extractors (closes #18035) --- youtube_dl/extractor/zattoo.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index bbe0aecb6..cb1bac3a3 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -22,7 +22,7 @@ class ZattooPlatformBaseIE(InfoExtractor): _power_guide_hash = None def _host_url(self): - return 'https://%s' % self._HOST + return 'https://%s' % (self._API_HOST if hasattr(self, '_API_HOST') else self._HOST) def _login(self): username, password = self._get_login_info() @@ 
-286,6 +286,7 @@ class ZattooLiveIE(ZattooBaseIE): class NetPlusIE(ZattooIE): _NETRC_MACHINE = 'netplus' _HOST = 'netplus.tv' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ @@ -300,7 +301,7 @@ class MNetTVIE(ZattooIE): _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ - 'url': 'https://www.tvplus.m-net.de/watch/abc/123-abc', + 'url': 'https://tvplus.m-net.de/watch/abc/123-abc', 'only_matching': True, }] @@ -311,7 +312,7 @@ class WalyTVIE(ZattooIE): _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ - 'url': 'https://www.player.waly.tv/watch/abc/123-abc', + 'url': 'https://player.waly.tv/watch/abc/123-abc', 'only_matching': True, }] @@ -319,6 +320,7 @@ class WalyTVIE(ZattooIE): class BBVTVIE(ZattooIE): _NETRC_MACHINE = 'bbvtv' _HOST = 'bbv-tv.net' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ @@ -330,6 +332,7 @@ class BBVTVIE(ZattooIE): class VTXTVIE(ZattooIE): _NETRC_MACHINE = 'vtxtv' _HOST = 'vtxtv.ch' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ @@ -341,6 +344,7 @@ class VTXTVIE(ZattooIE): class MyVisionTVIE(ZattooIE): _NETRC_MACHINE = 'myvisiontv' _HOST = 'myvisiontv.ch' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ @@ -355,7 +359,7 @@ class GlattvisionTVIE(ZattooIE): _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ - 'url': 'https://www.iptv.glattvision.ch/watch/abc/123-abc', + 'url': 'https://iptv.glattvision.ch/watch/abc/123-abc', 'only_matching': True, }] @@ -363,6 +367,7 @@ class GlattvisionTVIE(ZattooIE): class SAKTVIE(ZattooIE): _NETRC_MACHINE = 'saktv' _HOST = 'saktv.ch' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ @@ -377,7 +382,7 @@ class EWETVIE(ZattooIE): 
_VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ - 'url': 'https://www.tvonline.ewe.de/watch/abc/123-abc', + 'url': 'https://tvonline.ewe.de/watch/abc/123-abc', 'only_matching': True, }] @@ -385,6 +390,7 @@ class EWETVIE(ZattooIE): class QuantumTVIE(ZattooIE): _NETRC_MACHINE = 'quantumtv' _HOST = 'quantum-tv.com' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ @@ -399,7 +405,7 @@ class OsnatelTVIE(ZattooIE): _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ - 'url': 'https://www.onlinetv.osnatel.de/watch/abc/123-abc', + 'url': 'https://tvonline.osnatel.de/watch/abc/123-abc', 'only_matching': True, }] @@ -407,6 +413,7 @@ class OsnatelTVIE(ZattooIE): class EinsUndEinsTVIE(ZattooIE): _NETRC_MACHINE = '1und1tv' _HOST = '1und1.tv' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ From 2004e2210bc74aa950feba0f22df4d5a8980b3e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Nov 2018 17:09:57 +0700 Subject: [PATCH 054/159] [osnateltv] Update host --- youtube_dl/extractor/zattoo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index cb1bac3a3..896276301 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -401,7 +401,7 @@ class QuantumTVIE(ZattooIE): class OsnatelTVIE(ZattooIE): _NETRC_MACHINE = 'osnateltv' - _HOST = 'onlinetv.osnatel.de' + _HOST = 'tvonline.osnatel.de' _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ From c0345b825f8758571a8de871ab9349c46b062fc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Nov 2018 19:08:39 +0700 Subject: [PATCH 055/159] [youtube:playlist] Add support for invidio.us (closes #18077) --- youtube_dl/extractor/youtube.py | 9 
++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index abadfa545..6ab2db274 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2162,7 +2162,11 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): (?:https?://)? (?:\w+\.)? (?: - youtube\.com/ + (?: + youtube\.com| + invidio\.us + ) + / (?: (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) \? (?:.*?[&;])*? (?:p|a|list)= @@ -2314,6 +2318,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): # music album playlist 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', 'only_matching': True, + }, { + 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'only_matching': True, }] def _real_initialize(self): From 432cd4841023091811db46cd82c188698a386841 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 6 Nov 2018 23:29:42 +0700 Subject: [PATCH 056/159] [cliphinter] Fix extraction (closes #18083) --- youtube_dl/extractor/cliphunter.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index ab651d1c8..f2ca7a337 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -1,19 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none - - -_translation_table = { - 'a': 'h', 'd': 'e', 'e': 'v', 'f': 'o', 'g': 'f', 'i': 'd', 'l': 'n', - 'm': 'a', 'n': 'm', 'p': 'u', 'q': 't', 'r': 's', 'v': 'p', 'x': 'r', - 'y': 'l', 'z': 'i', - '$': ':', '&': '.', '(': '=', '^': '&', '=': '/', -} - - -def _decode(s): - return ''.join(_translation_table.get(c, c) for c in s) +from ..utils import ( + int_or_none, + url_or_none, +) class CliphunterIE(InfoExtractor): @@ -60,14 +51,14 @@ class 
CliphunterIE(InfoExtractor): formats = [] for format_id, f in gexo_files.items(): - video_url = f.get('url') + video_url = url_or_none(f.get('url')) if not video_url: continue fmt = f.get('fmt') height = f.get('h') format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id formats.append({ - 'url': _decode(video_url), + 'url': video_url, 'format_id': format_id, 'width': int_or_none(f.get('w')), 'height': int_or_none(height), From 0df514f07e23ce70cccec045b4e71bdec151fcc7 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 6 Nov 2018 21:22:00 +0100 Subject: [PATCH 057/159] [facebook] fix tahoe request(closes #17171) --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 97cfe0fc3..74954049d 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -57,7 +57,7 @@ class FacebookIE(InfoExtractor): _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' - _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true' + _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', From 2511eee215c2a66020ae927c86face826f48ba8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 7 Nov 2018 09:55:59 +0700 Subject: [PATCH 058/159] [youtube] Add another JS signature function name regex (closes #18091, closes #18093, closes #18094) --- youtube_dl/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6ab2db274..3f49f3889 100644 
--- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1192,7 +1192,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\('), + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) From f81d44aab6d8ee01024a637cb80374251737872e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 7 Nov 2018 09:58:08 +0700 Subject: [PATCH 059/159] [ChangeLog] Actualize [ci skip] --- ChangeLog | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ChangeLog b/ChangeLog index 11e1ba333..920a4855a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +version <unreleased> + +Extractors ++ [youtube] Add another JS signature function name regex (#18091, #18093, + #18094) +* [facebook] Fix tahoe request (#17171) +* [cliphunter] Fix extraction (#18083) ++ [youtube:playlist] Add support for invidio.us (#18077) +* [zattoo] Arrange API hosts for derived extractors (#18035) ++ [youtube] Add fallback metadata extraction from videoDetails (#18052) + + version 2018.11.03 Core From 532782ade1dab884606dbbd82081ed7ab9c52a13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 7 Nov 2018 01:38:25 +0700 Subject: [PATCH 060/159] release 2018.11.07 --- .github/ISSUE_TEMPLATE.md | 6 +++--- CONTRIBUTING.md | 23 ++++++++++++++++++++++- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index eb8cef8ef..7607e0e03 100644 --- a/.github/ISSUE_TEMPLATE.md +++ 
b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.03*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.03** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.07*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.07** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.11.03 +[debug] youtube-dl version 2018.11.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 333acee80..bbcb78808 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -296,5 +296,26 @@ title = self._search_regex( ### Use safe conversion functions -Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well. 
+Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. + +Use `url_or_none` for safe URL processing. + +Use `try_get` for safe metadata extraction from parsed JSON. + +Explore [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions. + +#### More examples + +##### Safely extract optional description from parsed JSON +```python +description = try_get(response, lambda x: x['result']['video'][0]['summary'], compat_str) +``` + +##### Safely extract more optional metadata +```python +video = try_get(response, lambda x: x['result']['video'][0], dict) or {} +description = video.get('summary') +duration = float_or_none(video.get('durationMs'), scale=1000) +view_count = int_or_none(video.get('views')) +``` diff --git a/ChangeLog b/ChangeLog index 920a4855a..fa5de8b04 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.11.07 Extractors + [youtube] Add another JS signature function name regex (#18091, #18093, diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 90de01214..7f32ad36c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.11.03' +__version__ = '2018.11.07' From cab26223bf480553d67840fc9f46aa9ff89ec29f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Nov 2018 15:22:59 +0700 Subject: [PATCH 061/159] [ruutu] Update API endpoint (closes #18138) --- youtube_dl/extractor/ruutu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 9fa8688f8..f530f0083 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -65,7 +65,8 @@ class 
RuutuIE(InfoExtractor): video_id = self._match_id(url) video_xml = self._download_xml( - 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id, video_id) + 'https://gatling.nelonenmedia.fi/media-xml-cache', video_id, + query={'id': video_id}) formats = [] processed_urls = [] From 96a91b15513af2121be1dd93871cc3769c06da3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Nov 2018 23:37:27 +0700 Subject: [PATCH 062/159] [vivo] Fix extraction (closes #18139) --- youtube_dl/extractor/shared.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index b2250afdd..931a0f70e 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -5,6 +5,7 @@ from ..compat import compat_b64decode from ..utils import ( ExtractorError, int_or_none, + url_or_none, urlencode_postdata, ) @@ -86,9 +87,16 @@ class VivoIE(SharedBaseIE): } def _extract_video_url(self, webpage, video_id, *args): + def decode_url(encoded_url): + return compat_b64decode(encoded_url).decode('utf-8') + + stream_url = url_or_none(decode_url(self._search_regex( + r'data-stream\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'stream url', default=None, group='url'))) + if stream_url: + return stream_url return self._parse_json( self._search_regex( r'InitializeStream\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'stream', group='url'), - video_id, - transform_source=lambda x: compat_b64decode(x).decode('utf-8'))[0] + video_id, transform_source=decode_url)[0] From 83852e57bf2f96ba50418b1a888ae5a1836549cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 11 Nov 2018 00:44:49 +0700 Subject: [PATCH 063/159] [zype] Add extractor (closes #18143) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 20 +++++++++++ youtube_dl/extractor/zype.py | 57 ++++++++++++++++++++++++++++++ 3 files 
changed, 78 insertions(+) create mode 100644 youtube_dl/extractor/zype.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e5488cce4..b2b00c86f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1478,3 +1478,4 @@ from .zattoo import ( ) from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE +from .zype import ZypeIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 545e03371..59cf03faf 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -114,6 +114,7 @@ from .apa import APAIE from .foxnews import FoxNewsIE from .viqeo import ViqeoIE from .expressen import ExpressenIE +from .zype import ZypeIE class GenericIE(InfoExtractor): @@ -2070,6 +2071,20 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 6, }, + { + # Zype embed + 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', + 'info_dict': { + 'id': '5b400b834b32992a310622b9', + 'ext': 'mp4', + 'title': 'Smoky Barbecue Favorites', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + 'add_ie': [ZypeIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, { # videojs embed 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904', @@ -3129,6 +3144,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key()) + zype_urls = ZypeIE._extract_urls(webpage) + if zype_urls: + return self.playlist_from_matches( + zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py new file mode 100644 index 000000000..3b16e703b --- /dev/null +++ b/youtube_dl/extractor/zype.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import 
InfoExtractor + + +class ZypeIE(InfoExtractor): + _VALID_URL = r'https?://player\.zype\.com/embed/(?P<id>[\da-fA-F]+)\.js\?.*?api_key=[^&]+' + _TEST = { + 'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false', + 'md5': 'eaee31d474c76a955bdaba02a505c595', + 'info_dict': { + 'id': '5b400b834b32992a310622b9', + 'ext': 'mp4', + 'title': 'Smoky Barbecue Favorites', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + } + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//player\.zype\.com/embed/[\da-fA-F]+\.js\?.*?api_key=.+?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._search_regex( + r'video_title\s*[:=]\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'title', group='value') + + m3u8_url = self._search_regex( + r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', webpage, + 'm3u8 url', group='url') + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + thumbnail = self._search_regex( + r'poster\s*[:=]\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'thumbnail', + default=False, group='url') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } From f17a24a6df293370b94082c7feb6c447a3e7d8d9 Mon Sep 17 00:00:00 2001 From: Patrick Griffis <tingping@tingping.se> Date: Mon, 21 May 2018 17:02:16 -0400 Subject: [PATCH 064/159] [picarto] Use API and add token support This is just more reliable than trying to extract it from the page itself. 
--- youtube_dl/extractor/picarto.py | 36 +++++++++------------------------ 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py index 2366dfb34..27ee9643b 100644 --- a/youtube_dl/extractor/picarto.py +++ b/youtube_dl/extractor/picarto.py @@ -8,14 +8,13 @@ from ..compat import compat_str from ..utils import ( ExtractorError, js_to_json, - try_get, update_url_query, urlencode_postdata, ) class PicartoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)(?:/(?P<token>[a-zA-Z0-9]+))?' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -34,19 +33,11 @@ class PicartoIE(InfoExtractor): def _real_extract(self, url): channel_id = self._match_id(url) - stream_page = self._download_webpage(url, channel_id) + metadata = self._download_json( + 'https://api.picarto.tv/v1/channel/name/' + channel_id, + channel_id) - if '>This channel does not exist' in stream_page: - raise ExtractorError( - 'Channel %s does not exist' % channel_id, expected=True) - - player = self._parse_json( - self._search_regex( - r'(?s)playerSettings\[\d+\]\s*=\s*(\{.+?\}\s*\n)', stream_page, - 'player settings'), - channel_id, transform_source=js_to_json) - - if player.get('online') is False: + if metadata.get('online') is False: raise ExtractorError('Stream is offline', expected=True) cdn_data = self._download_json( @@ -54,20 +45,13 @@ class PicartoIE(InfoExtractor): data=urlencode_postdata({'loadbalancinginfo': channel_id}), note='Downloading load balancing info') - def get_event(key): - return try_get(player, lambda x: x['event'][key], compat_str) or '' - + token = self._VALID_URL_RE.match(url).group('token') or 'public' params = { - 'token': player.get('token') or '', - 'ticket': get_event('ticket'), 'con': int(time.time() * 1000), - 'type': get_event('ticket'), - 'scope': get_event('scope'), + 'token': token, } 
prefered_edge = cdn_data.get('preferedEdge') - default_tech = player.get('defaultTech') - formats = [] for edge in cdn_data['edges']: @@ -81,8 +65,6 @@ class PicartoIE(InfoExtractor): preference = 0 if edge_id == prefered_edge: preference += 1 - if tech_type == default_tech: - preference += 1 format_id = [] if edge_id: format_id.append(edge_id) @@ -109,7 +91,7 @@ class PicartoIE(InfoExtractor): continue self._sort_formats(formats) - mature = player.get('mature') + mature = metadata.get('adult') if mature is None: age_limit = None else: @@ -119,7 +101,7 @@ class PicartoIE(InfoExtractor): 'id': channel_id, 'title': self._live_title(channel_id), 'is_live': True, - 'thumbnail': player.get('vodThumb'), + 'thumbnail': metadata.get('thumbnails', {}).get('web'), 'age_limit': age_limit, 'formats': formats, } From 730c0d12a06f349907481570f1f2890251f7a181 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 11 Nov 2018 16:08:54 +0700 Subject: [PATCH 065/159] [picarto] Extract more metadata (closes #16518) --- youtube_dl/extractor/picarto.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py index 27ee9643b..8099ef1d6 100644 --- a/youtube_dl/extractor/picarto.py +++ b/youtube_dl/extractor/picarto.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import re import time from .common import InfoExtractor @@ -8,6 +9,7 @@ from ..compat import compat_str from ..utils import ( ExtractorError, js_to_json, + try_get, update_url_query, urlencode_postdata, ) @@ -32,7 +34,9 @@ class PicartoIE(InfoExtractor): return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) def _real_extract(self, url): - channel_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + metadata = self._download_json( 'https://api.picarto.tv/v1/channel/name/' + channel_id, channel_id) @@ 
-45,7 +49,7 @@ class PicartoIE(InfoExtractor): data=urlencode_postdata({'loadbalancinginfo': channel_id}), note='Downloading load balancing info') - token = self._VALID_URL_RE.match(url).group('token') or 'public' + token = mobj.group('token') or 'public' params = { 'con': int(time.time() * 1000), 'token': token, @@ -99,9 +103,11 @@ class PicartoIE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(channel_id), + 'title': self._live_title(metadata.get('title') or channel_id), 'is_live': True, - 'thumbnail': metadata.get('thumbnails', {}).get('web'), + 'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']), + 'channel': channel_id, + 'channel_url': 'https://picarto.tv/%s' % channel_id, 'age_limit': age_limit, 'formats': formats, } From 9b9b3501c5bee18d608dd2961a80936667f8ece2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 16 Nov 2018 22:55:35 +0700 Subject: [PATCH 066/159] [tnaflixnetwork:embed] Fix extraction (closes #18205) --- youtube_dl/extractor/tnaflix.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 0c2f8f119..6798ef4c3 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -18,8 +18,9 @@ from ..utils import ( class TNAFlixNetworkBaseIE(InfoExtractor): # May be overridden in descendants if necessary _CONFIG_REGEX = [ - r'flashvars\.config\s*=\s*escape\("([^"]+)"', - r'<input[^>]+name="config\d?" value="([^"]+)"', + r'flashvars\.config\s*=\s*escape\("(?P<url>[^"]+)"', + r'<input[^>]+name="config\d?" 
value="(?P<url>[^"]+)"', + r'config\s*=\s*(["\'])(?P<url>(?:https?:)?//(?:(?!\1).)+)\1', ] _HOST = 'tna' _VKEY_SUFFIX = '' @@ -85,7 +86,8 @@ class TNAFlixNetworkBaseIE(InfoExtractor): webpage = self._download_webpage(url, display_id) cfg_url = self._proto_relative_url(self._html_search_regex( - self._CONFIG_REGEX, webpage, 'flashvars.config', default=None), 'http:') + self._CONFIG_REGEX, webpage, 'flashvars.config', default=None, + group='url'), 'http:') if not cfg_url: inputs = self._hidden_inputs(webpage) From 2599956c9ff0162e7afbddebb00d73eea6b0403c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 17 Nov 2018 00:07:59 +0700 Subject: [PATCH 067/159] [rte] Add support for new API endpoint (closes #18206) --- youtube_dl/extractor/rte.py | 129 +++++++++++++++++++++--------------- 1 file changed, 77 insertions(+), 52 deletions(-) diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index a6fac6c35..1fbc72915 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -8,7 +8,10 @@ from ..compat import compat_HTTPError from ..utils import ( float_or_none, parse_iso8601, + str_or_none, + try_get, unescapeHTML, + url_or_none, ExtractorError, ) @@ -17,65 +20,87 @@ class RteBaseIE(InfoExtractor): def _real_extract(self, url): item_id = self._match_id(url) - try: - json_string = self._download_json( - 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=' + item_id, - item_id) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: - error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False) - if error_info: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error_info['message']), - expected=True) - raise - - # NB the string values in the JSON are stored using XML escaping(!) 
- show = json_string['shows'][0] - title = unescapeHTML(show['title']) - description = unescapeHTML(show.get('description')) - thumbnail = show.get('thumbnail') - duration = float_or_none(show.get('duration'), 1000) - timestamp = parse_iso8601(show.get('published')) - - mg = show['media:group'][0] - + info_dict = {} formats = [] - if mg.get('url'): - m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url']) - if m: - m = m.groupdict() - formats.append({ - 'url': m['url'] + '/' + m['app'], - 'app': m['app'], - 'play_path': m['playpath'], - 'player_url': url, - 'ext': 'flv', - 'format_id': 'rtmp', - }) + ENDPOINTS = ( + 'https://feeds.rasset.ie/rteavgen/player/playlist?type=iptv&format=json&showId=', + 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=', + ) - if mg.get('hls_server') and mg.get('hls_url'): - formats.extend(self._extract_m3u8_formats( - mg['hls_server'] + mg['hls_url'], item_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + for num, ep_url in enumerate(ENDPOINTS, start=1): + try: + data = self._download_json(ep_url + item_id, item_id) + except ExtractorError as ee: + if num < len(ENDPOINTS) or formats: + continue + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False) + if error_info: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_info['message']), + expected=True) + raise - if mg.get('hds_server') and mg.get('hds_url'): - formats.extend(self._extract_f4m_formats( - mg['hds_server'] + mg['hds_url'], item_id, - f4m_id='hds', fatal=False)) + # NB the string values in the JSON are stored using XML escaping(!) 
+ show = try_get(data, lambda x: x['shows'][0], dict) + if not show: + continue + + if not info_dict: + title = unescapeHTML(show['title']) + description = unescapeHTML(show.get('description')) + thumbnail = show.get('thumbnail') + duration = float_or_none(show.get('duration'), 1000) + timestamp = parse_iso8601(show.get('published')) + info_dict = { + 'id': item_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + } + + mg = try_get(show, lambda x: x['media:group'][0], dict) + if not mg: + continue + + if mg.get('url'): + m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url']) + if m: + m = m.groupdict() + formats.append({ + 'url': m['url'] + '/' + m['app'], + 'app': m['app'], + 'play_path': m['playpath'], + 'player_url': url, + 'ext': 'flv', + 'format_id': 'rtmp', + }) + + if mg.get('hls_server') and mg.get('hls_url'): + formats.extend(self._extract_m3u8_formats( + mg['hls_server'] + mg['hls_url'], item_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + if mg.get('hds_server') and mg.get('hds_url'): + formats.extend(self._extract_f4m_formats( + mg['hds_server'] + mg['hds_url'], item_id, + f4m_id='hds', fatal=False)) + + mg_rte_server = str_or_none(mg.get('rte:server')) + mg_url = str_or_none(mg.get('url')) + if mg_rte_server and mg_url: + hds_url = url_or_none(mg_rte_server + mg_url) + if hds_url: + formats.extend(self._extract_f4m_formats( + hds_url, item_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) - return { - 'id': item_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - } + info_dict['formats'] = formats + return info_dict class RteIE(RteBaseIE): From 0919cd4d011d0545a6afadfab0b71de8c0a4fe02 Mon Sep 17 00:00:00 2001 From: NeroBurner <pyro4hell@gmail.com> Date: Fri, 16 Nov 2018 18:18:50 +0100 Subject: [PATCH 068/159] 
[atvat] Fix extraction (closes #18041) --- youtube_dl/extractor/atvat.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/atvat.py b/youtube_dl/extractor/atvat.py index 1584d53fc..95e572d70 100644 --- a/youtube_dl/extractor/atvat.py +++ b/youtube_dl/extractor/atvat.py @@ -28,8 +28,10 @@ class ATVAtIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_data = self._parse_json(unescapeHTML(self._search_regex( - r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="([^"]+)"', - webpage, 'player data')), display_id)['config']['initial_video'] + [r'flashPlayerOptions\s*=\s*(["\'])(?P<json>(?:(?!\1).)+)\1', + r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="(?P<json>[^"]+)"'], + webpage, 'player data', group='json')), + display_id)['config']['initial_video'] video_id = video_data['id'] video_title = video_data['title'] From d0058c76d5e14ffe89e8265fa3d984e28e922d78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 17 Nov 2018 16:59:20 +0700 Subject: [PATCH 069/159] [openload] Use original host during extraction (closes #18211) --- youtube_dl/extractor/openload.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 2473536fd..cf51e4770 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,18 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' + _VALID_URL = r'''(?x) + https?:// + (?P<host> + (?:www\.)? 
+ (?: + openload\.(?:co|io|link)| + oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun) + ) + )/ + (?:f|embed)/ + (?P<id>[a-zA-Z0-9-_]+) + ''' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -334,8 +345,11 @@ class OpenloadIE(InfoExtractor): webpage) def _real_extract(self, url): - video_id = self._match_id(url) - url_pattern = 'https://openload.co/%%s/%s/' % video_id + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + video_id = mobj.group('id') + + url_pattern = 'https://%s/%%s/%s/' % (host, video_id) headers = { 'User-Agent': self._USER_AGENT, } @@ -368,7 +382,7 @@ class OpenloadIE(InfoExtractor): r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, 'stream URL')) - video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id + video_url = 'https://%s/stream/%s?mime=true' % (host, decoded_id) title = self._og_search_title(webpage, default=None) or self._search_regex( r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage, @@ -379,7 +393,7 @@ class OpenloadIE(InfoExtractor): entry = entries[0] if entries else {} subtitles = entry.get('subtitles') - info_dict = { + return { 'id': video_id, 'title': title, 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), @@ -388,4 +402,3 @@ class OpenloadIE(InfoExtractor): 'subtitles': subtitles, 'http_headers': headers, } - return info_dict From a640c4d226e7b790fe8db43f1c5bdf2358caf839 Mon Sep 17 00:00:00 2001 From: aviperes <avipr24@gmail.com> Date: Sat, 17 Nov 2018 15:59:13 +0200 Subject: [PATCH 070/159] [vk] Detect geo restriction --- youtube_dl/extractor/vk.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index ef8b9bcb7..b52d15ac6 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -293,8 +293,12 @@ class VKIE(VKBaseIE): # This video is no longer available, because its author has been blocked. 
'url': 'https://vk.com/video-10639516_456240611', 'only_matching': True, - } - ] + }, + { + # The video is not available in your region. + 'url': 'https://vk.com/video-51812607_171445436', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -354,6 +358,9 @@ class VKIE(VKBaseIE): r'<!>This video is no longer available, because it has been deleted.': 'Video %s is no longer available, because it has been deleted.', + + r'<!>The video .+? is not available in your region.': + 'Video %s is not available in your region.', } for error_re, error_msg in ERRORS.items(): From 11d19ff50393cd195af884c6865aff6d89ed66ac Mon Sep 17 00:00:00 2001 From: mttronc <mrtn.mtth@gmx.de> Date: Thu, 6 Sep 2018 15:41:07 +0200 Subject: [PATCH 071/159] [wwe] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/wwe.py | 56 ++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 youtube_dl/extractor/wwe.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b2b00c86f..87c7d8b0c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1386,6 +1386,7 @@ from .wsj import ( WSJIE, WSJArticleIE, ) +from .wwe import WWEIE from .xbef import XBefIE from .xboxclips import XboxClipsIE from .xfileshare import XFileShareIE diff --git a/youtube_dl/extractor/wwe.py b/youtube_dl/extractor/wwe.py new file mode 100644 index 000000000..c471a79f5 --- /dev/null +++ b/youtube_dl/extractor/wwe.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import urljoin + + +class WWEIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)?wwe.com/(?:.*/)?videos/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.wwe.com/videos/daniel-bryan-vs-andrade-cien-almas-smackdown-live-sept-4-2018', + 'md5': '30cbc824b51f4010ea885bfcaec76972', + 'info_dict': { + 'id': 
'40048199', + 'ext': 'mp4', + 'title': 'Daniel Bryan vs. Andrade "Cien" Almas: SmackDown LIVE, Sept. 4, 2018', + 'description': 'Still fuming after he and his wife Brie Bella were attacked by The Miz and Maryse last week, Daniel Bryan takes care of some unfinished business with Andrade "Cien" Almas.', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://de.wwe.com/videos/gran-metalik-vs-tony-nese-wwe-205-live-sept-4-2018', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + drupal_settings = self._parse_json( + self._html_search_regex( + r'(?s)Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), + display_id) + + player = drupal_settings['WWEVideoLanding']['initialVideo'] + metadata = player['playlist'][0] + + id = compat_str(metadata['nid']) + title = metadata.get('title') or self._og_search_title(webpage) + video_url = 'https:' + metadata['file'] + thumbnail = None + if metadata.get('image') is not None: + thumbnail = urljoin(url, metadata.get('image')) + description = metadata.get('description') + + formats = self._extract_m3u8_formats(video_url, id, 'mp4') + + return { + 'id': id, + 'title': title, + 'formats': formats, + 'url': video_url, + 'display_id': display_id, + 'thumbnail': thumbnail, + 'description': description, + } From 006374e3aebc3be3f20b7e812c987cdf15b3ae35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 17 Nov 2018 23:59:20 +0700 Subject: [PATCH 072/159] [wwe] Fix issues, extract subtitles and add support for playlists (closes #14781, closes #17450) --- youtube_dl/extractor/wwe.py | 138 +++++++++++++++++++++++++++++------- 1 file changed, 111 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/wwe.py b/youtube_dl/extractor/wwe.py index c471a79f5..bebc77bb5 100644 --- a/youtube_dl/extractor/wwe.py +++ b/youtube_dl/extractor/wwe.py @@ -1,20 +1,75 @@ from 
__future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_str -from ..utils import urljoin +from ..utils import ( + try_get, + unescapeHTML, + url_or_none, + urljoin, +) -class WWEIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?wwe.com/(?:.*/)?videos/(?P<id>[\w-]+)' +class WWEBaseIE(InfoExtractor): + _SUBTITLE_LANGS = { + 'English': 'en', + 'Deutsch': 'de', + } + + def _extract_entry(self, data, url, video_id=None): + video_id = compat_str(video_id or data['nid']) + title = data['title'] + + formats = self._extract_m3u8_formats( + data['file'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + description = data.get('description') + thumbnail = urljoin(url, data.get('image')) + series = data.get('show_name') + episode = data.get('episode_name') + + subtitles = {} + tracks = data.get('tracks') + if isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + if track.get('kind') != 'captions': + continue + track_file = url_or_none(track.get('file')) + if not track_file: + continue + label = track.get('label') + lang = self._SUBTITLE_LANGS.get(label, label) or 'en' + subtitles.setdefault(lang, []).append({ + 'url': track_file, + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'series': series, + 'episode': episode, + 'formats': formats, + 'subtitles': subtitles, + } + + +class WWEIE(WWEBaseIE): + _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*videos/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.wwe.com/videos/daniel-bryan-vs-andrade-cien-almas-smackdown-live-sept-4-2018', - 'md5': '30cbc824b51f4010ea885bfcaec76972', + 'md5': '92811c6a14bfc206f7a6a9c5d9140184', 'info_dict': { 'id': '40048199', 'ext': 'mp4', 'title': 'Daniel Bryan vs. Andrade "Cien" Almas: SmackDown LIVE, Sept. 
4, 2018', - 'description': 'Still fuming after he and his wife Brie Bella were attacked by The Miz and Maryse last week, Daniel Bryan takes care of some unfinished business with Andrade "Cien" Almas.', + 'description': 'md5:2d7424dbc6755c61a0e649d2a8677f67', 'thumbnail': r're:^https?://.*\.jpg$', } }, { @@ -26,31 +81,60 @@ class WWEIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - drupal_settings = self._parse_json( + landing = self._parse_json( self._html_search_regex( - r'(?s)Drupal\.settings\s*,\s*({.+?})\);', + r'(?s)Drupal\.settings\s*,\s*({.+?})\s*\)\s*;', webpage, 'drupal settings'), - display_id) + display_id)['WWEVideoLanding'] - player = drupal_settings['WWEVideoLanding']['initialVideo'] - metadata = player['playlist'][0] + data = landing['initialVideo']['playlist'][0] + video_id = landing.get('initialVideoId') - id = compat_str(metadata['nid']) - title = metadata.get('title') or self._og_search_title(webpage) - video_url = 'https:' + metadata['file'] - thumbnail = None - if metadata.get('image') is not None: - thumbnail = urljoin(url, metadata.get('image')) - description = metadata.get('description') + info = self._extract_entry(data, url, video_id) + info['display_id'] = display_id + return info - formats = self._extract_m3u8_formats(video_url, id, 'mp4') - return { - 'id': id, - 'title': title, - 'formats': formats, - 'url': video_url, - 'display_id': display_id, - 'thumbnail': thumbnail, - 'description': description, - } +class WWEPlaylistIE(WWEBaseIE): + _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.wwe.com/shows/raw/2018-11-12', + 'info_dict': { + 'id': '2018-11-12', + }, + 'playlist_mincount': 11, + }, { + 'url': 'http://www.wwe.com/article/walk-the-prank-wwe-edition', + 'only_matching': True, + }, { + 'url': 'https://www.wwe.com/shows/wwenxt/article/matt-riddle-interview', + 'only_matching': True, + }] + + @classmethod + def 
suitable(cls, url): + return False if WWEIE.suitable(url) else super(WWEPlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'data-video\s*=\s*(["\'])(?P<data>{.+?})\1', webpage): + video = self._parse_json( + mobj.group('data'), display_id, transform_source=unescapeHTML, + fatal=False) + if not video: + continue + data = try_get(video, lambda x: x['playlist'][0], dict) + if not data: + continue + try: + entry = self._extract_entry(data, url) + except Exception: + continue + entry['extractor_key'] = WWEIE.ie_key() + entries.append(entry) + + return self.playlist_result(entries, display_id) From 02df855e1339942c00f365dc59a0e418abb4b26f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Nov 2018 00:07:40 +0700 Subject: [PATCH 073/159] [ChangeLog] Actualize [ci skip] --- ChangeLog | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index fa5de8b04..15daa1bec 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +version <unreleased> + +Extractors ++ [wwe] Extract subtitles ++ [wwe] Add support for playlists (#14781) ++ [wwe] Add support for wwe.com (#14781, #17450) +* [vk] Detect geo restriction (#17767) +* [openload] Use original host during extraction (#18211) +* [atvat] Fix extraction (#18041) ++ [rte] Add support for new API endpoint (#18206) +* [tnaflixnetwork:embed] Fix extraction (#18205) +* [picarto] Use API and add token support (#16518) ++ [zype] Add support for player.zype.com (#18143) +* [vivo] Fix extraction (#18139) +* [ruutu] Update API endpoint (#18138) + + version 2018.11.07 Extractors From 5bb04792696c41f66f2114b0cc05b01135fa1f68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Nov 2018 00:11:54 +0700 Subject: [PATCH 074/159] release 2018.11.18 --- .github/ISSUE_TEMPLATE.md |
6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7607e0e03..905576364 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.07*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.07** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.11.07 +[debug] youtube-dl version 2018.11.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 15daa1bec..0083c4631 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 
@@ -version <unreleased> +version 2018.11.18 Extractors + [wwe] Extract subtitles diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 24c3254c3..9009f7e9e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1080,6 +1080,7 @@ - **wrzuta.pl:playlist** - **WSJ**: Wall Street Journal - **WSJArticle** + - **WWE** - **XBef** - **XboxClips** - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo, RapidVideo.TV, FastVideo.me @@ -1139,3 +1140,4 @@ - **ZDF** - **ZDFChannel** - **zingmp3**: mp3.zing.vn + - **Zype** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7f32ad36c..7f5ad7bf4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.11.07' +__version__ = '2018.11.18' From 4167148fa45e43a93ed202e5923223b7797340ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Nov 2018 01:07:54 +0700 Subject: [PATCH 075/159] [nova:embed] Fix extraction (closes #18222) --- youtube_dl/extractor/nova.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 80186ec50..901f44b54 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -35,7 +35,7 @@ class NovaEmbedIE(InfoExtractor): bitrates = self._parse_json( self._search_regex( - r'(?s)bitrates\s*=\s*({.+?})\s*;', webpage, 'formats'), + r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), video_id, transform_source=js_to_json) QUALITIES = ('lq', 'mq', 'hq', 'hd') From 1febf99da1924c46a491790639371f4ee9069193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Nov 2018 06:26:08 +0700 Subject: [PATCH 076/159] [pornhub] Add pornhub.net alias --- youtube_dl/extractor/pornhub.py | 9 ++++++--- 1 
file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 19eaf389f..7ee64dbf6 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -27,7 +27,7 @@ class PornHubIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P<id>[\da-z]+) @@ -340,7 +340,7 @@ class PornHubPlaylistBaseIE(InfoExtractor): class PornHubPlaylistIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/playlist/(?P<id>\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.(?:com|net)/playlist/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.pornhub.com/playlist/4667351', 'info_dict': { @@ -355,7 +355,7 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE): class PornHubUserVideosIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos' + _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos' _TESTS = [{ 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'info_dict': { @@ -393,6 +393,9 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'only_matching': True, + }, { + 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', + 'only_matching': True, }] def _real_extract(self, url): From f97c099131f625104f64a99a02a8c9894620171a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Nov 2018 11:14:46 +0700 Subject: [PATCH 077/159] [pornhub] Move test to correct place --- youtube_dl/extractor/pornhub.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py 
b/youtube_dl/extractor/pornhub.py index 7ee64dbf6..c9c884095 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -121,6 +121,9 @@ class PornHubIE(InfoExtractor): }, { 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', 'only_matching': True, + }, { + 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', + 'only_matching': True, }] @staticmethod @@ -393,9 +396,6 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'only_matching': True, - }, { - 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', - 'only_matching': True, }] def _real_extract(self, url): From 964b989dc88c37b027481fb01de835b1e796ba5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Nov 2018 20:44:51 +0700 Subject: [PATCH 078/159] [americastestkitchen] Add support for zype embeds (closes #18225) --- youtube_dl/extractor/americastestkitchen.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 01736872d..8b32aa886 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -43,10 +43,6 @@ class AmericasTestKitchenIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - partner_id = self._search_regex( - r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', - webpage, 'kaltura partner id') - video_data = self._parse_json( self._search_regex( r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*</script>', @@ -58,7 +54,18 @@ class AmericasTestKitchenIE(InfoExtractor): (lambda x: x['episodeDetail']['content']['data'], lambda x: x['videoDetail']['content']['data']), dict) ep_meta = ep_data.get('full_video', {}) - external_id = ep_data.get('external_id') or ep_meta['external_id'] + + zype_id = 
ep_meta.get('zype_id') + if zype_id: + embed_url = 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id + ie_key = 'Zype' + else: + partner_id = self._search_regex( + r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', + webpage, 'kaltura partner id') + external_id = ep_data.get('external_id') or ep_meta['external_id'] + embed_url = 'kaltura:%s:%s' % (partner_id, external_id) + ie_key = 'Kaltura' title = ep_data.get('title') or ep_meta.get('title') description = clean_html(ep_meta.get('episode_description') or ep_data.get( @@ -72,8 +79,8 @@ class AmericasTestKitchenIE(InfoExtractor): return { '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, external_id), - 'ie_key': 'Kaltura', + 'url': embed_url, + 'ie_key': ie_key, 'title': title, 'description': description, 'thumbnail': thumbnail, From 9b27a78a881bceb9d62f3364399d7572e8e2be24 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 18 Nov 2018 16:13:46 +0100 Subject: [PATCH 079/159] [kaltura] limit requested MediaEntry fields --- youtube_dl/extractor/kaltura.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 04f68fce4..fdf7f5bbc 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -192,6 +192,8 @@ class KalturaIE(InfoExtractor): 'entryId': video_id, 'service': 'baseentry', 'ks': '{1:result:ks}', + 'responseProfile:fields': 'createdAt,dataUrl,duration,name,plays,thumbnailUrl,userId', + 'responseProfile:type': 1, }, { 'action': 'getbyentryid', From 8578ea4dcb17834ee3843e0e337c15af706f9803 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 18 Nov 2018 16:15:10 +0100 Subject: [PATCH 080/159] [bitchute] use _html_search_regex for title extraction --- youtube_dl/extractor/bitchute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py index 446a1ab19..43b4732aa 100644 --- a/youtube_dl/extractor/bitchute.py +++ b/youtube_dl/extractor/bitchute.py @@ -37,7 +37,7 @@ class BitChuteIE(InfoExtractor): 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', }) - title = self._search_regex( + title = self._html_search_regex( (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'), webpage, 'title', default=None) or self._html_search_meta( 'description', webpage, 'title', From 2e1280ed432257244ea52a47efe6a7f0e226b897 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 19 Nov 2018 18:15:51 +0100 Subject: [PATCH 081/159] [sixplay] fix format extraction --- youtube_dl/extractor/sixplay.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 207ab4477..0c4f865ef 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -64,7 +64,7 @@ class SixPlayIE(InfoExtractor): for asset in clip_data['assets']: asset_url = asset.get('full_physical_path') protocol = asset.get('protocol') - if not asset_url or protocol == 'primetime' or asset_url in urls: + if not asset_url or protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264' or asset_url in urls: continue urls.append(asset_url) container = asset.get('video_container') @@ -81,19 +81,17 @@ class SixPlayIE(InfoExtractor): if not urlh: continue asset_url = urlh.geturl() - asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url) - formats.extend(self._extract_m3u8_formats( - asset_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - asset_url.replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - formats.extend(self._extract_mpd_formats( - 
asset_url.replace('.m3u8', '.mpd'), - video_id, mpd_id='dash', fatal=False)) - formats.extend(self._extract_ism_formats( - re.sub(r'/[^/]+\.m3u8', '/Manifest', asset_url), - video_id, ism_id='mss', fatal=False)) + for i in range(3, 0, -1): + asset_url = asset_url = asset_url.replace('_sd1/', '_sd%d/' % i) + m3u8_formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + formats.extend(self._extract_mpd_formats( + asset_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) + if m3u8_formats: + break else: formats.extend(self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', From 15ed5a27840e748d9f786c50b78a4c6326e9f186 Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Tue, 20 Nov 2018 20:50:40 +0100 Subject: [PATCH 082/159] [nzz] Relax kaltura regex --- youtube_dl/extractor/nzz.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nzz.py b/youtube_dl/extractor/nzz.py index 2d352f53f..61ee77adb 100644 --- a/youtube_dl/extractor/nzz.py +++ b/youtube_dl/extractor/nzz.py @@ -11,20 +11,27 @@ from ..utils import ( class NZZIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?nzz\.ch/(?:[^/]+/)*[^/?#]+-ld\.(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.nzz.ch/zuerich/gymizyte/gymizyte-schreiben-schueler-heute-noch-diktate-ld.9153', 'info_dict': { 'id': '9153', }, 'playlist_mincount': 6, - } + }, { + 'url': 'https://www.nzz.ch/video/nzz-standpunkte/cvp-auf-der-suche-nach-dem-mass-der-mitte-ld.1368112', + 'info_dict': { + 'id': '1368112', + }, + 'playlist_count': 1, + }] def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) entries = [] - for player_element in re.findall(r'(<[^>]+class="kalturaPlayer"[^>]*>)', webpage): + for player_element in re.findall( + r'(<[^>]+class="kalturaPlayer[^"]*"[^>]*>)', webpage): 
player_params = extract_attributes(player_element) if player_params.get('data-type') not in ('kaltura_singleArticle',): self.report_warning('Unsupported player type') From 05bd5e9c77e0e8acb95f47396be4c970fc9f39c4 Mon Sep 17 00:00:00 2001 From: Austin de Coup-Crank <austindcc@gmail.com> Date: Fri, 26 Oct 2018 19:15:44 -0700 Subject: [PATCH 083/159] [ciscolive] Add extractor --- youtube_dl/extractor/ciscolive.py | 136 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 137 insertions(+) create mode 100644 youtube_dl/extractor/ciscolive.py diff --git a/youtube_dl/extractor/ciscolive.py b/youtube_dl/extractor/ciscolive.py new file mode 100644 index 000000000..2db7aad2c --- /dev/null +++ b/youtube_dl/extractor/ciscolive.py @@ -0,0 +1,136 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs +) +from ..utils import ( + clean_html, + int_or_none, + try_get, + urlencode_postdata, +) + + +class CiscoLiveIE(InfoExtractor): + IE_NAME = 'ciscolive' + _VALID_URL = r'(?:https?://)?ciscolive\.cisco\.com/on-demand-library/\??(?P<query>[^#]+)#/(?:session/(?P<id>.+))?$' + _TESTS = [ + { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs', + 'md5': 'c98acf395ed9c9f766941c70f5352e22', + 'info_dict': { + 'id': '5803694304001', + 'ext': 'mp4', + 'title': '13 Smart Automations to Monitor Your Cisco IOS Network', + 'description': 'md5:ec4a436019e09a918dec17714803f7cc', + 'timestamp': 1530305395, + 'uploader_id': '5647924234001', + 'upload_date': '20180629', + 'location': '16B Mezz.', + }, + }, + { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', + 'md5': '993d4cf051f6174059328b1dce8e94bd', + 'info_dict': { + 'upload_date': '20180629', + 'title': 
'DevNet Panel-Applying Design Thinking to Building Products in Cisco', + 'timestamp': 1530316421, + 'uploader_id': '5647924234001', + 'id': '5803751616001', + 'description': 'md5:5f144575cd6848117fe2f756855b038b', + 'location': 'WoS, DevNet Theater', + 'ext': 'mp4', + }, + }, + { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', + 'md5': '80e0c3b87e373fe3a3316b934b8915bf', + 'info_dict': { + 'upload_date': '20180629', + 'title': 'Beating the CCIE Routing & Switching', + 'timestamp': 1530311842, + 'uploader_id': '5647924234001', + 'id': '5803735679001', + 'description': 'md5:e71970799e92d7f5ff57ae23f64b0929', + 'location': 'Tulúm 02', + 'ext': 'mp4', + }, + } + ] + + # These appear to be constant across all Cisco Live presentations + # and are not tied to any user session or event + RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s' + RAINFOCUS_APIPROFILEID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz' + RAINFOCUS_WIDGETID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s' + + def _parse_rf_item(self, rf_item): + ''' Parses metadata and passes to Brightcove extractor ''' + event_name = rf_item.get('eventName') + title = rf_item['title'] + description = clean_html(rf_item.get('abstract')) + presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName']) + bc_id = rf_item['videos'][0]['url'] + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id + duration = int_or_none(try_get(rf_item, lambda x: x['times'][0]['length'])) + location = try_get(rf_item, lambda x: x['times'][0]['room']) + + if duration: + duration = duration * 60 + + return { + '_type': 'url_transparent', + 'creator': presenter_name, + 'description': description, + 'duration': duration, + 'ie_key': 'BrightcoveNew', + 'location': location, 
+ 'series': event_name, + 'title': title, + 'url': bc_url, + } + + def _check_bc_id_exists(self, rf_item): + ''' Checks for the existence of a Brightcove URL in an API result ''' + bc_id = try_get(rf_item, lambda x: x['videos'][0]['url']) + if bc_id: + if bc_id.strip().isdigit(): + return rf_item + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + HEADERS = { + 'Origin': 'https://ciscolive.cisco.com', + 'rfApiProfileId': self.RAINFOCUS_APIPROFILEID, + 'rfWidgetId': self.RAINFOCUS_WIDGETID, + 'Referer': url, + } + # Single session URL (single video) + if mobj.group('id'): + rf_id = mobj.group('id') + request = self.RAINFOCUS_API_URL % 'session' + data = urlencode_postdata({'id': rf_id}) + rf_result = self._download_json(request, rf_id, data=data, headers=HEADERS) + rf_item = self._check_bc_id_exists(rf_result['items'][0]) + return self._parse_rf_item(rf_item) + else: + # Filter query URL (multiple videos) + rf_query = compat_parse_qs((compat_urllib_parse_urlparse(url).query)) + rf_query['type'] = 'session' + rf_query['size'] = 1000 + data = urlencode_postdata(rf_query) + request = self.RAINFOCUS_API_URL % 'search' + rf_results = self._download_json(request, 'Filter query', data=data, headers=HEADERS) + entries = [ + self._parse_rf_item(rf_item) + for rf_item + in rf_results['sectionList'][0]['items'] + if self._check_bc_id_exists(rf_item) + ] + return self.playlist_result(entries, 'Filter query') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 87c7d8b0c..2c5988a14 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -194,6 +194,7 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE +from .ciscolive import CiscoLiveIE from .cjsw import CJSWIE from .cliphunter import CliphunterIE from .clippit import ClippitIE From 6a6d7f064178427d28986884524bd3434f0ca957 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> 
Date: Wed, 21 Nov 2018 05:25:43 +0700 Subject: [PATCH 084/159] [ciscolive] Fix issues and improve extraction (closes #17984) --- youtube_dl/extractor/ciscolive.py | 176 ++++++++++++++--------------- youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 87 insertions(+), 94 deletions(-) diff --git a/youtube_dl/extractor/ciscolive.py b/youtube_dl/extractor/ciscolive.py index 2db7aad2c..32f645713 100644 --- a/youtube_dl/extractor/ciscolive.py +++ b/youtube_dl/extractor/ciscolive.py @@ -1,84 +1,49 @@ # coding: utf-8 from __future__ import unicode_literals -import re from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urllib_parse_urlparse, - compat_parse_qs ) from ..utils import ( clean_html, + float_or_none, int_or_none, try_get, urlencode_postdata, ) -class CiscoLiveIE(InfoExtractor): - IE_NAME = 'ciscolive' - _VALID_URL = r'(?:https?://)?ciscolive\.cisco\.com/on-demand-library/\??(?P<query>[^#]+)#/(?:session/(?P<id>.+))?$' - _TESTS = [ - { - 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs', - 'md5': 'c98acf395ed9c9f766941c70f5352e22', - 'info_dict': { - 'id': '5803694304001', - 'ext': 'mp4', - 'title': '13 Smart Automations to Monitor Your Cisco IOS Network', - 'description': 'md5:ec4a436019e09a918dec17714803f7cc', - 'timestamp': 1530305395, - 'uploader_id': '5647924234001', - 'upload_date': '20180629', - 'location': '16B Mezz.', - }, - }, - { - 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', - 'md5': '993d4cf051f6174059328b1dce8e94bd', - 'info_dict': { - 'upload_date': '20180629', - 'title': 'DevNet Panel-Applying Design Thinking to Building Products in Cisco', - 'timestamp': 1530316421, - 'uploader_id': '5647924234001', - 'id': '5803751616001', - 'description': 'md5:5f144575cd6848117fe2f756855b038b', - 'location': 'WoS, DevNet Theater', - 'ext': 'mp4', 
- }, - }, - { - 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', - 'md5': '80e0c3b87e373fe3a3316b934b8915bf', - 'info_dict': { - 'upload_date': '20180629', - 'title': 'Beating the CCIE Routing & Switching', - 'timestamp': 1530311842, - 'uploader_id': '5647924234001', - 'id': '5803735679001', - 'description': 'md5:e71970799e92d7f5ff57ae23f64b0929', - 'location': 'Tulúm 02', - 'ext': 'mp4', - }, - } - ] - +class CiscoLiveBaseIE(InfoExtractor): # These appear to be constant across all Cisco Live presentations # and are not tied to any user session or event RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s' - RAINFOCUS_APIPROFILEID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz' - RAINFOCUS_WIDGETID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye' + RAINFOCUS_API_PROFILE_ID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz' + RAINFOCUS_WIDGET_ID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s' + HEADERS = { + 'Origin': 'https://ciscolive.cisco.com', + 'rfApiProfileId': RAINFOCUS_API_PROFILE_ID, + 'rfWidgetId': RAINFOCUS_WIDGET_ID, + } + + def _call_api(self, ep, rf_id, query, referrer): + headers = self.HEADERS.copy() + headers['Referer'] = referrer + return self._download_json( + self.RAINFOCUS_API_URL % ep, rf_id, data=urlencode_postdata(query), + headers=headers) + def _parse_rf_item(self, rf_item): - ''' Parses metadata and passes to Brightcove extractor ''' event_name = rf_item.get('eventName') title = rf_item['title'] description = clean_html(rf_item.get('abstract')) presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName']) bc_id = rf_item['videos'][0]['url'] bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id - duration = int_or_none(try_get(rf_item, lambda x: x['times'][0]['length'])) + duration = 
float_or_none(try_get(rf_item, lambda x: x['times'][0]['length'])) location = try_get(rf_item, lambda x: x['times'][0]['room']) if duration: @@ -86,51 +51,76 @@ class CiscoLiveIE(InfoExtractor): return { '_type': 'url_transparent', - 'creator': presenter_name, + 'url': bc_url, + 'ie_key': 'BrightcoveNew', + 'title': title, 'description': description, 'duration': duration, - 'ie_key': 'BrightcoveNew', + 'creator': presenter_name, 'location': location, 'series': event_name, - 'title': title, - 'url': bc_url, } - def _check_bc_id_exists(self, rf_item): - ''' Checks for the existence of a Brightcove URL in an API result ''' - bc_id = try_get(rf_item, lambda x: x['videos'][0]['url']) - if bc_id: - if bc_id.strip().isdigit(): - return rf_item + +class CiscoLiveSessionIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://ciscolive\.cisco\.com/on-demand-library/\??[^#]*#/session/(?P<id>[^/?&]+)' + _TEST = { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs', + 'md5': 'c98acf395ed9c9f766941c70f5352e22', + 'info_dict': { + 'id': '5803694304001', + 'ext': 'mp4', + 'title': '13 Smart Automations to Monitor Your Cisco IOS Network', + 'description': 'md5:ec4a436019e09a918dec17714803f7cc', + 'timestamp': 1530305395, + 'upload_date': '20180629', + 'uploader_id': '5647924234001', + 'location': '16B Mezz.', + }, + 'params': { + 'proxy': '127.0.0.1:8118', + } + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - HEADERS = { - 'Origin': 'https://ciscolive.cisco.com', - 'rfApiProfileId': self.RAINFOCUS_APIPROFILEID, - 'rfWidgetId': self.RAINFOCUS_WIDGETID, - 'Referer': url, + rf_id = self._match_id(url) + rf_result = self._call_api('session', rf_id, {'id': rf_id}, url) + return self._parse_rf_item(rf_result['items'][0]) + + +class CiscoLiveSearchIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://ciscolive\.cisco\.com/on-demand-library/' + _TESTS = [{ + 'url': 
'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', + 'info_dict': { + 'title': 'Filter query', + }, + 'playlist_count': 5, + 'params': { + 'proxy': '127.0.0.1:8118', } - # Single session URL (single video) - if mobj.group('id'): - rf_id = mobj.group('id') - request = self.RAINFOCUS_API_URL % 'session' - data = urlencode_postdata({'id': rf_id}) - rf_result = self._download_json(request, rf_id, data=data, headers=HEADERS) - rf_item = self._check_bc_id_exists(rf_result['items'][0]) - return self._parse_rf_item(rf_item) - else: - # Filter query URL (multiple videos) - rf_query = compat_parse_qs((compat_urllib_parse_urlparse(url).query)) - rf_query['type'] = 'session' - rf_query['size'] = 1000 - data = urlencode_postdata(rf_query) - request = self.RAINFOCUS_API_URL % 'search' - rf_results = self._download_json(request, 'Filter query', data=data, headers=HEADERS) - entries = [ - self._parse_rf_item(rf_item) - for rf_item - in rf_results['sectionList'][0]['items'] - if self._check_bc_id_exists(rf_item) - ] - return self.playlist_result(entries, 'Filter query') + }, { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url) + + @staticmethod + def _check_bc_id_exists(rf_item): + return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None + + def _real_extract(self, url): + rf_query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + rf_query['type'] = 'session' + rf_query['size'] = 1000 + rf_results = self._call_api('search', None, rf_query, url) + entries = [ + 
self._parse_rf_item(rf_item) + for rf_item + in rf_results['sectionList'][0]['items'] + if self._check_bc_id_exists(rf_item) + ] + return self.playlist_result(entries, playlist_title='Filter query') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2c5988a14..60e6175b1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -194,7 +194,10 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE -from .ciscolive import CiscoLiveIE +from .ciscolive import ( + CiscoLiveSessionIE, + CiscoLiveSearchIE, +) from .cjsw import CJSWIE from .cliphunter import CliphunterIE from .clippit import ClippitIE From 183417a50fd68c0c63b1d0621c6a0b44fbf2ac52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 21 Nov 2018 06:04:34 +0700 Subject: [PATCH 085/159] [ciscolive:search] Add support for pagination --- youtube_dl/extractor/ciscolive.py | 58 ++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/ciscolive.py b/youtube_dl/extractor/ciscolive.py index 32f645713..c99b6ee58 100644 --- a/youtube_dl/extractor/ciscolive.py +++ b/youtube_dl/extractor/ciscolive.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools + from .common import InfoExtractor from ..compat import ( compat_parse_qs, @@ -29,12 +31,12 @@ class CiscoLiveBaseIE(InfoExtractor): 'rfWidgetId': RAINFOCUS_WIDGET_ID, } - def _call_api(self, ep, rf_id, query, referrer): + def _call_api(self, ep, rf_id, query, referrer, note=None): headers = self.HEADERS.copy() headers['Referer'] = referrer return self._download_json( - self.RAINFOCUS_API_URL % ep, rf_id, data=urlencode_postdata(query), - headers=headers) + self.RAINFOCUS_API_URL % ep, rf_id, note=note, + data=urlencode_postdata(query), headers=headers) def _parse_rf_item(self, rf_item): event_name = rf_item.get('eventName') @@ -77,9 +79,6 
@@ class CiscoLiveSessionIE(CiscoLiveBaseIE): 'uploader_id': '5647924234001', 'location': '16B Mezz.', }, - 'params': { - 'proxy': '127.0.0.1:8118', - } } def _real_extract(self, url): @@ -93,12 +92,9 @@ class CiscoLiveSearchIE(CiscoLiveBaseIE): _TESTS = [{ 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', 'info_dict': { - 'title': 'Filter query', + 'title': 'Search query', }, 'playlist_count': 5, - 'params': { - 'proxy': '127.0.0.1:8118', - } }, { 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', 'only_matching': True, @@ -112,15 +108,35 @@ class CiscoLiveSearchIE(CiscoLiveBaseIE): def _check_bc_id_exists(rf_item): return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None + def _entries(self, query, url): + query['size'] = 50 + query['from'] = 0 + for page_num in itertools.count(1): + results = self._call_api( + 'search', None, query, url, + 'Downloading search JSON page %d' % page_num) + sl = try_get(results, lambda x: x['sectionList'][0], dict) + if sl: + results = sl + items = results.get('items') + if not items or not isinstance(items, list): + break + for item in items: + if not isinstance(item, dict): + continue + if not self._check_bc_id_exists(item): + continue + yield self._parse_rf_item(item) + size = int_or_none(results.get('size')) + if size is not None: + query['size'] = size + total = int_or_none(results.get('total')) + if total is not None and query['from'] + query['size'] > total: + break + query['from'] += query['size'] + def _real_extract(self, url): - rf_query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - rf_query['type'] = 'session' - rf_query['size'] = 1000 - rf_results = self._call_api('search', None, 
rf_query, url) - entries = [ - self._parse_rf_item(rf_item) - for rf_item - in rf_results['sectionList'][0]['items'] - if self._check_bc_id_exists(rf_item) - ] - return self.playlist_result(entries, playlist_title='Filter query') + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + query['type'] = 'session' + return self.playlist_result( + self._entries(query, url), playlist_title='Search query') From 6c882aa8991383e1c39a6457cbde5dcab260bff5 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 09:44:10 +0100 Subject: [PATCH 086/159] [loc] relax _VALID_URL regex and improve formats extraction --- youtube_dl/extractor/libraryofcongress.py | 37 +++++++++++++++-------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py index 40295a30b..1e5c82c66 100644 --- a/youtube_dl/extractor/libraryofcongress.py +++ b/youtube_dl/extractor/libraryofcongress.py @@ -16,7 +16,7 @@ from ..utils import ( class LibraryOfCongressIE(InfoExtractor): IE_NAME = 'loc' IE_DESC = 'Library of Congress' - _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9a-z_.]+)' _TESTS = [{ # embedded via <div class="media-player" 'url': 'http://loc.gov/item/90716351/', @@ -57,6 +57,12 @@ class LibraryOfCongressIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.loc.gov/item/ihas.200197114/', + 'only_matching': True, + }, { + 'url': 'https://www.loc.gov/item/afc1981005_afs20503/', + 'only_matching': True, }] def _real_extract(self, url): @@ -67,12 +73,13 @@ class LibraryOfCongressIE(InfoExtractor): (r'id=(["\'])media-player-(?P<id>.+?)\1', r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1', r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1', - 
r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1'), + r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1', + r'data-tab="share-media-(?P<id>[0-9A-F]{32})"'), webpage, 'media id', group='id') data = self._download_json( 'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id, - video_id)['mediaObject'] + media_id)['mediaObject'] derivative = data['derivatives'][0] media_url = derivative['derivativeUrl'] @@ -89,25 +96,29 @@ class LibraryOfCongressIE(InfoExtractor): if ext not in ('mp4', 'mp3'): media_url += '.mp4' if is_video else '.mp3' - if 'vod/mp4:' in media_url: - formats = [{ - 'url': media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8', + formats = [] + if '/vod/mp4:' in media_url: + formats.append({ + 'url': media_url.replace('/vod/mp4:', '/hls-vod/media/') + '.m3u8', 'format_id': 'hls', 'ext': 'mp4', 'protocol': 'm3u8_native', 'quality': 1, - }] - elif 'vod/mp3:' in media_url: - formats = [{ - 'url': media_url.replace('vod/mp3:', ''), - 'vcodec': 'none', - }] + }) + http_format = { + 'url': re.sub(r'(://[^/]+/)(?:[^/]+/)*(?:mp4|mp3):', r'\1', media_url), + 'format_id': 'http', + 'quality': 1, + } + if not is_video: + http_format['vcodec'] = 'none' + formats.append(http_format) download_urls = set() for m in re.finditer( r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?: |\s+)\((?P<size>.+?)\))?\s*<', webpage): format_id = m.group('id').lower() - if format_id == 'gif': + if format_id in ('gif', 'jpeg'): continue download_url = m.group('url') if download_url in download_urls: From 35328915b5fe5c8915b924cfbc54bbdd6d6d1430 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 09:46:13 +0100 Subject: [PATCH 087/159] [foxsports] fix extraction(closes #17543) --- youtube_dl/extractor/foxsports.py | 17 +++-------------- youtube_dl/extractor/theplatform.py | 5 +++-- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/foxsports.py 
b/youtube_dl/extractor/foxsports.py index 985542727..596fded20 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -8,7 +8,7 @@ from ..utils import ( class FoxSportsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*video/(?P<id>\d+)' _TEST = { 'url': 'http://www.foxsports.com/tennessee/video/432609859715', @@ -28,16 +28,5 @@ class FoxSportsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - config = self._parse_json( - self._html_search_regex( - r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""", - webpage, 'data player config'), - video_id) - - return self.url_result(smuggle_url(update_url_query( - config['releaseURL'], { - 'mbr': 'true', - 'switch': 'http', - }), {'force_smil_url': True})) + return self.url_result( + 'https://feed.theplatform.com/f/BKQ29B/foxsports-all?byId=' + video_id, 'ThePlatformFeed') diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 181620615..90b351cbb 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -343,7 +343,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None): real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query) entry = self._download_json(real_url, video_id)['entries'][0] - main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else None + main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else entry.get('plmedia$publicUrl') formats = [] subtitles = {} @@ -356,7 +356,8 @@ class 
ThePlatformFeedIE(ThePlatformBaseIE): if first_video_id is None: first_video_id = cur_video_id duration = float_or_none(item.get('plfile$duration')) - for asset_type in item['plfile$assetTypes']: + file_asset_types = item.get('plfile$assetTypes') or compat_parse_qs(compat_urllib_parse_urlparse(smil_url).query)['assetTypes'] + for asset_type in file_asset_types: if asset_type in asset_types: continue asset_types.append(asset_type) From 4e33e0792a3e134b494bd71f257a674294cca8d9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 12:00:50 +0100 Subject: [PATCH 088/159] [loc] update test --- youtube_dl/extractor/libraryofcongress.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py index 1e5c82c66..03f205144 100644 --- a/youtube_dl/extractor/libraryofcongress.py +++ b/youtube_dl/extractor/libraryofcongress.py @@ -20,12 +20,11 @@ class LibraryOfCongressIE(InfoExtractor): _TESTS = [{ # embedded via <div class="media-player" 'url': 'http://loc.gov/item/90716351/', - 'md5': '353917ff7f0255aa6d4b80a034833de8', + 'md5': '6ec0ae8f07f86731b1b2ff70f046210a', 'info_dict': { 'id': '90716351', 'ext': 'mp4', 'title': "Pa's trip to Mars", - 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 0, 'view_count': int, }, From 6866f2449437eeb0ad93b80e5bf39cf758af7a26 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 12:08:46 +0100 Subject: [PATCH 089/159] [foxsports] update test --- youtube_dl/extractor/foxsports.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index 596fded20..2b2cb6c6f 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -1,10 +1,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - smuggle_url, - 
update_url_query, -) class FoxSportsIE(InfoExtractor): @@ -14,14 +10,19 @@ class FoxSportsIE(InfoExtractor): 'url': 'http://www.foxsports.com/tennessee/video/432609859715', 'md5': 'b49050e955bebe32c301972e4012ac17', 'info_dict': { - 'id': 'bwduI3X_TgUB', + 'id': '432609859715', 'ext': 'mp4', 'title': 'Courtney Lee on going up 2-0 in series vs. Blazers', 'description': 'Courtney Lee talks about Memphis being focused.', - 'upload_date': '20150423', - 'timestamp': 1429761109, + # TODO: fix timestamp + 'upload_date': '19700101', # '20150423', + # 'timestamp': 1429761109, 'uploader': 'NEWA-FNG-FOXSPORTS', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'add_ie': ['ThePlatform'], } From a843464a7e0608b679651f913cbd9447a7b928c0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 12:10:06 +0100 Subject: [PATCH 090/159] [nbc] fix NBCNews article extraction(closes #16194) --- youtube_dl/extractor/nbc.py | 91 ++++++++----------------------------- 1 file changed, 19 insertions(+), 72 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 765c46fd2..3282f84ee 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,10 +9,8 @@ from .theplatform import ThePlatformIE from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( - find_xpath_attr, smuggle_url, try_get, - unescapeHTML, update_url_query, int_or_none, ) @@ -269,27 +267,14 @@ class CSNNEIE(InfoExtractor): class NBCNewsIE(ThePlatformIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/ - (?:video/.+?/(?P<id>\d+)| - ([^/]+/)*(?:.*-)?(?P<mpx_id>[^/?]+)) - ''' + _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)' _TESTS = [ - { - 'url': 'http://www.nbcnews.com/video/nbc-news/52753292', - 'md5': '47abaac93c6eaf9ad37ee6c4463a5179', - 'info_dict': { - 'id': '52753292', - 'ext': 'flv', - 'title': 'Crew 
emerges after four-month Mars food study', - 'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1', - }, - }, { 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', 'md5': 'af1adfa51312291a017720403826bb64', 'info_dict': { - 'id': 'p_tweet_snow_140529', + 'id': '269389891880', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', @@ -313,7 +298,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', 'md5': '73135a2e0ef819107bbb55a5a9b2a802', 'info_dict': { - 'id': 'nn_netcast_150204', + 'id': '394064451844', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', @@ -326,7 +311,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', 'md5': 'a49e173825e5fcd15c13fc297fced39d', 'info_dict': { - 'id': 'x_lon_vwhorn_150922', + 'id': '529953347624', 'ext': 'mp4', 'title': 'Volkswagen U.S. 
Chief:\xa0 We Have Totally Screwed Up', 'description': 'md5:c8be487b2d80ff0594c005add88d8351', @@ -339,7 +324,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', 'md5': '118d7ca3f0bea6534f119c68ef539f71', 'info_dict': { - 'id': 'tdy_al_space_160420', + 'id': '669831235788', 'ext': 'mp4', 'title': 'See the aurora borealis from space in stunning new NASA video', 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', @@ -352,7 +337,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', 'info_dict': { - 'id': 'n_hayes_Aimm_140801_272214', + 'id': '314487875924', 'ext': 'mp4', 'title': 'The chaotic GOP immigration vote', 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', @@ -374,60 +359,22 @@ class NBCNewsIE(ThePlatformIE): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - if video_id is not None: - all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) - info = all_info.find('video') - - return { - 'id': video_id, - 'title': info.find('headline').text, - 'ext': 'flv', - 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, - 'description': info.find('caption').text, - 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, - } - else: - # "feature" and "nightly-news" pages use theplatform.com - video_id = mobj.group('mpx_id') + video_id = self._match_id(url) + if not video_id.isdigit(): webpage = self._download_webpage(url, video_id) - filter_param = 'byId' - bootstrap_json = self._search_regex( - [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', - 
r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"', - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);'], - webpage, 'bootstrap json', default=None) - if bootstrap_json: - bootstrap = self._parse_json( - bootstrap_json, video_id, transform_source=unescapeHTML) + data = self._parse_json(self._search_regex( + r'window\.__data\s*=\s*({.+});', webpage, + 'bootstrap json'), video_id) + video_id = data['article']['content'][0]['primaryMedia']['video']['mpxMetadata']['id'] - info = None - if 'results' in bootstrap: - info = bootstrap['results'][0]['video'] - elif 'video' in bootstrap: - info = bootstrap['video'] - elif 'msnbcVideoInfo' in bootstrap: - info = bootstrap['msnbcVideoInfo']['meta'] - elif 'msnbcThePlatform' in bootstrap: - info = bootstrap['msnbcThePlatform']['videoPlayer']['video'] - else: - info = bootstrap - - if 'guid' in info: - video_id = info['guid'] - filter_param = 'byGuid' - elif 'mpxId' in info: - video_id = info['mpxId'] - - return { - '_type': 'url_transparent', - 'id': video_id, - # http://feed.theplatform.com/f/2E2eJC/nbcnews also works - 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {filter_param: video_id}), - 'ie_key': 'ThePlatformFeed', - } + return { + '_type': 'url_transparent', + 'id': video_id, + # http://feed.theplatform.com/f/2E2eJC/nbcnews also works + 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {'byId': video_id}), + 'ie_key': 'ThePlatformFeed', + } class NBCOlympicsIE(InfoExtractor): From af60e81e3c557ace943aab35c1364d3d03d5a3bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 22 Nov 2018 02:00:38 +0700 Subject: [PATCH 091/159] [setup.py] Add more relevant classifiers --- setup.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/setup.py b/setup.py index 7dbb5805f..a1a08f1e2 100644 --- a/setup.py +++ b/setup.py @@ -124,6 +124,8 @@ setup( 'Development Status :: 5 - Production/Stable', 'Environment :: Console', 'License 
:: Public Domain', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', @@ -132,6 +134,12 @@ setup( 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: Implementation', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: IronPython', + 'Programming Language :: Python :: Implementation :: Jython', + 'Programming Language :: Python :: Implementation :: PyPy', ], cmdclass={'build_lazy_extractors': build_lazy_extractors}, From bd2d553c7b1529f793c2b7343c514a558543fc0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 22 Nov 2018 02:01:22 +0700 Subject: [PATCH 092/159] [travis] Add python 3.7 build --- .travis.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.travis.yml b/.travis.yml index 92f326860..1ea640071 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,12 @@ env: - YTDL_TEST_SET=download matrix: include: + - python: 3.7 + dist: xenial + env: YTDL_TEST_SET=core + - python: 3.7 + dist: xenial + env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - env: JYTHON=true; YTDL_TEST_SET=download fast_finish: true From 157eef3e635230cbba0dd0c74f7115029867533e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 22 Nov 2018 02:08:41 +0700 Subject: [PATCH 093/159] [setup.py] Add python 3.8 classifier --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index a1a08f1e2..dfb669ad2 100644 --- a/setup.py +++ b/setup.py @@ -135,6 +135,7 @@ setup( 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming 
Language :: Python :: 3.8', 'Programming Language :: Python :: Implementation', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: IronPython', From 305ce767d586e8796d873270abf771e69ff5586c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 22 Nov 2018 02:34:35 +0700 Subject: [PATCH 094/159] [travis] Add python 3.8-dev build --- .travis.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.travis.yml b/.travis.yml index 1ea640071..79287ccf6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,6 +21,12 @@ matrix: - python: 3.7 dist: xenial env: YTDL_TEST_SET=download + - python: 3.8-dev + dist: xenial + env: YTDL_TEST_SET=core + - python: 3.8-dev + dist: xenial + env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - env: JYTHON=true; YTDL_TEST_SET=download fast_finish: true From 560020da3049bec19e5714e9e24fc90fadd06582 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 23:19:46 +0100 Subject: [PATCH 095/159] [mixcloud] fallback to hardcoded decryption key(closes #18016) --- youtube_dl/extractor/mixcloud.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index b7bccb504..a2d19d3ef 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -161,11 +161,17 @@ class MixcloudIE(InfoExtractor): stream_info = info_json['streamInfo'] formats = [] + def decrypt_url(f_url): + for k in (key, 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'): + decrypted_url = self._decrypt_xor_cipher(k, compat_b64decode(f_url)) + if re.search(r'^https?://[0-9a-z.]+/[0-9A-Za-z/.?=&_-]+$', decrypted_url): + return decrypted_url + for url_key in ('url', 'hlsUrl', 'dashUrl'): format_url = stream_info.get(url_key) if not format_url: continue - decrypted = self._decrypt_xor_cipher(key, 
compat_b64decode(format_url)) + decrypted = decrypt_url(format_url) if not decrypted: continue if url_key == 'hlsUrl': From 6f2883a2df45ca89d272bc8a0975f09758af5eb3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 23:25:38 +0100 Subject: [PATCH 096/159] [mixcloud] base64 decode before decryption --- youtube_dl/extractor/mixcloud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index a2d19d3ef..bcac13ec5 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -163,7 +163,7 @@ class MixcloudIE(InfoExtractor): def decrypt_url(f_url): for k in (key, 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'): - decrypted_url = self._decrypt_xor_cipher(k, compat_b64decode(f_url)) + decrypted_url = self._decrypt_xor_cipher(k, f_url) if re.search(r'^https?://[0-9a-z.]+/[0-9A-Za-z/.?=&_-]+$', decrypted_url): return decrypted_url @@ -171,7 +171,7 @@ class MixcloudIE(InfoExtractor): format_url = stream_info.get(url_key) if not format_url: continue - decrypted = decrypt_url(format_url) + decrypted = decrypt_url(compat_b64decode(format_url)) if not decrypted: continue if url_key == 'hlsUrl': From 66173211c4177d36612486acfd99fc4634b8004e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 23 Nov 2018 00:14:43 +0700 Subject: [PATCH 097/159] [ChangeLog] Actualize [ci skip] --- ChangeLog | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ChangeLog b/ChangeLog index 0083c4631..beb002041 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,23 @@ +version <unreleased> + +Core ++ [setup.py] Add more relevant classifiers + +Extractors +* [mixcloud] Fallback to hardcoded decryption key (#18016) +* [nbc:news] Fix article extraction (#16194) +* [foxsports] Fix extraction (#17543) +* [loc] Relax regular expression and improve formats extraction ++ [ciscolive] Add support 
for ciscolive.cisco.com (#17984) +* [nzz] Relax kaltura regex (#18228) +* [sixplay] Fix formats extraction +* [bitchute] Improve title extraction +* [kaltura] Limit requested MediaEntry fields ++ [americastestkitchen] Add support for zype embeds (#18225) ++ [pornhub] Add pornhub.net alias +* [nova:embed] Fix extraction (#18222) + + version 2018.11.18 Extractors From d861a9d5814408973e0715bb9160fb7db34fbcd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 23 Nov 2018 00:16:45 +0700 Subject: [PATCH 098/159] release 2018.11.23 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 905576364..35cc8d6d0 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.23** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.11.18 +[debug] youtube-dl version 2018.11.23 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index beb002041..f82c7ea35 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.11.23 Core + [setup.py] Add more relevant classifiers diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9009f7e9e..7d72ad82d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -163,6 +163,8 @@ - **chirbit** - **chirbit:profile** - **Cinchcast** + - **CiscoLiveSearch** + - **CiscoLiveSession** - **CJSW** - **cliphunter** - **Clippit** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7f5ad7bf4..4956365d0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.11.18' +__version__ = '2018.11.23' From 6864855eb111dbf6e0efe9ed086f48efa1d9f209 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 23 Nov 2018 00:43:42 +0700 Subject: [PATCH 099/159] [tests] Fix invalid escape sequences --- test/test_compat.py | 2 +- test/test_postprocessors.py | 2 +- 2 files changed, 2 insertions(+), 
2 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index d6c54e135..51fe6aa0b 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -39,7 +39,7 @@ class TestCompat(unittest.TestCase): def test_compat_expanduser(self): old_home = os.environ.get('HOME') - test_str = 'C:\Documents and Settings\тест\Application Data' + test_str = r'C:\Documents and Settings\тест\Application Data' compat_setenv('HOME', test_str) self.assertEqual(compat_expanduser('~'), test_str) compat_setenv('HOME', old_home or '') diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py index addb69d6f..4209d1d9a 100644 --- a/test/test_postprocessors.py +++ b/test/test_postprocessors.py @@ -14,4 +14,4 @@ from youtube_dl.postprocessor import MetadataFromTitlePP class TestMetadataFromTitle(unittest.TestCase): def test_format_to_regex(self): pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s') - self.assertEqual(pp._titleregex, '(?P<title>.+)\ \-\ (?P<artist>.+)') + self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)') From 641e86e3cf751f1050ca331b9d13152bd0e18558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 24 Nov 2018 21:47:41 +0700 Subject: [PATCH 100/159] [wistia] Add support for fast.wistia.com (closes #18287) --- youtube_dl/extractor/wistia.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 2182d6fd4..01a51275e 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -12,7 +12,7 @@ from ..utils import ( class WistiaIE(InfoExtractor): - _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.net/embed/iframe/)(?P<id>[a-z0-9]+)' + _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/iframe/)(?P<id>[a-z0-9]+)' _API_URL = 'http://fast.wistia.com/embed/medias/%s.json' _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s' @@ -35,12 +35,15 @@ class 
WistiaIE(InfoExtractor): # with hls video 'url': 'wistia:807fafadvk', 'only_matching': True, + }, { + 'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt', + 'only_matching': True, }] @staticmethod def _extract_url(webpage): match = re.search( - r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/iframe/.+?)\1', webpage) if match: return unescapeHTML(match.group('url')) From d19600df07128c73ef7242af7e1cd8c819951aba Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Sat, 24 Nov 2018 16:14:27 +0100 Subject: [PATCH 101/159] [joj] Fix extraction (closes #18280) --- youtube_dl/extractor/joj.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py index d9f8dbfd2..62b28e980 100644 --- a/youtube_dl/extractor/joj.py +++ b/youtube_dl/extractor/joj.py @@ -61,7 +61,7 @@ class JojIE(InfoExtractor): bitrates = self._parse_json( self._search_regex( - r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates', + r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates', default='{}'), video_id, transform_source=js_to_json, fatal=False) From ca01d178844129bd4b6ed74740fbd30e7f84c1c2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 28 Nov 2018 19:53:22 +0100 Subject: [PATCH 102/159] [vimeo] Add support for VHX(Vimeo OTT)(#14835) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/vimeo.py | 85 +++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 60e6175b1..cd91c0fcb 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1303,6 +1303,7 @@ from .vimeo import ( VimeoReviewIE, VimeoUserIE, VimeoWatchLaterIE, + VHXEmbedIE, 
) from .vimple import VimpleIE from .vine import ( diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 88f4d9979..6353c6831 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -14,10 +14,13 @@ from ..compat import ( from ..utils import ( determine_ext, ExtractorError, + js_to_json, InAdvancePagedList, int_or_none, merge_dicts, NO_DEFAULT, + parse_filesize, + qualities, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -27,7 +30,6 @@ from ..utils import ( unsmuggle_url, urlencode_postdata, unescapeHTML, - parse_filesize, ) @@ -1063,3 +1065,84 @@ class VimeoLikesIE(InfoExtractor): 'description': description, 'entries': pl, } + + +class VHXEmbedIE(InfoExtractor): + IE_NAME = 'vhx:embed' + _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)' + + def _call_api(self, video_id, access_token, path='', query=None): + return self._download_json( + 'https://api.vhx.tv/videos/' + video_id + path, video_id, headers={ + 'Authorization': 'Bearer ' + access_token, + }, query=query) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + credentials = self._parse_json(self._search_regex( + r'(?s)credentials\s*:\s*({.+?}),', webpage, + 'config'), video_id, js_to_json) + access_token = credentials['access_token'] + + query = {} + for k, v in credentials.items(): + if k in ('authorization', 'authUserToken', 'ticket') and v and v != 'undefined': + if k == 'authUserToken': + query['auth_user_token'] = v + else: + query[k] = v + files = self._call_api(video_id, access_token, '/files', query) + + formats = [] + for f in files: + href = try_get(f, lambda x: x['_links']['source']['href']) + if not href: + continue + method = f.get('method') + if method == 'hls': + formats.extend(self._extract_m3u8_formats( + href, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif method == 'dash': + formats.extend(self._extract_mpd_formats( + href, 
video_id, mpd_id='dash', fatal=False)) + else: + fmt = { + 'filesize': int_or_none(try_get(f, lambda x: x['size']['bytes'])), + 'format_id': 'http', + 'preference': 1, + 'url': href, + 'vcodec': f.get('codec'), + } + quality = f.get('quality') + if quality: + fmt.update({ + 'format_id': 'http-' + quality, + 'height': int_or_none(self._search_regex(r'(\d+)p', quality, 'height', default=None)), + }) + formats.append(fmt) + self._sort_formats(formats) + + video_data = self._call_api(video_id, access_token) + title = video_data.get('title') or video_data['name'] + + q = qualities(['small', 'medium', 'large', 'source']) + thumbnails = [] + for thumbnail_id, thumbnail_url in video_data.get('thumbnail', {}).items(): + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + 'preference': q(thumbnail_id), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': int_or_none(try_get(video_data, lambda x: x['duration']['seconds'])), + 'formats': formats, + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(video_data.get('created_at')), + 'view_count': int_or_none(video_data.get('plays_count')), + } From d9df8f120b325766181fb474a8c534e51df78f17 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 28 Nov 2018 20:13:36 +0100 Subject: [PATCH 103/159] [vimeo] extract VHX subtitles --- youtube_dl/extractor/vimeo.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 6353c6831..5e15f060b 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1127,6 +1127,17 @@ class VHXEmbedIE(InfoExtractor): video_data = self._call_api(video_id, access_token) title = video_data.get('title') or video_data['name'] + subtitles = {} + for subtitle in try_get(video_data, lambda x: x['tracks']['subtitles'], list) or []: + lang = subtitle.get('srclang') or subtitle.get('label') + for _link in 
subtitle.get('_links', {}).values(): + href = _link.get('href') + if not href: + continue + subtitles.setdefault(lang, []).append({ + 'url': href, + }) + q = qualities(['small', 'medium', 'large', 'source']) thumbnails = [] for thumbnail_id, thumbnail_url in video_data.get('thumbnail', {}).items(): @@ -1142,6 +1153,7 @@ class VHXEmbedIE(InfoExtractor): 'description': video_data.get('description'), 'duration': int_or_none(try_get(video_data, lambda x: x['duration']['seconds'])), 'formats': formats, + 'subtitles': subtitles, 'thumbnails': thumbnails, 'timestamp': unified_timestamp(video_data.get('created_at')), 'view_count': int_or_none(video_data.get('plays_count')), From 053e5b12b2e38b7d343aafbb7dc13fb8e4933015 Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Thu, 29 Nov 2018 18:12:18 +0100 Subject: [PATCH 104/159] [azmedien] Fix extraction (closes #18334) --- youtube_dl/extractor/azmedien.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index a57a5f114..fcbdc71b9 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -36,7 +36,6 @@ class AZMedienIE(InfoExtractor): 'id': '1_anruz3wy', 'ext': 'mp4', 'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen', - 'description': 'md5:dd9f96751ec9c35e409a698a328402f3', 'uploader_id': 'TVOnline', 'upload_date': '20180930', 'timestamp': 1538328802, @@ -53,15 +52,12 @@ class AZMedienIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') video_id = mobj.group('id') entry_id = mobj.group('kaltura_id') if not entry_id: - webpage = self._download_webpage(url, video_id) - api_path = self._search_regex( - r'["\']apiPath["\']\s*:\s*["\']([^"^\']+)["\']', - webpage, 'api path') - api_url = 'https://www.%s%s' % (mobj.group('host'), api_path) + api_url = 'https://www.%s/api/pub/gql/%s' % (host, host.split('.')[0]) payload = { 
'query': '''query VideoContext($articleId: ID!) { article: node(id: $articleId) { From adbbdefc8126a933d9ff0a6e603fb312e4b4cbdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 30 Nov 2018 00:48:15 +0700 Subject: [PATCH 105/159] [hotstar] Add support for alternative app state layout (closes #18320) --- youtube_dl/extractor/hotstar.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index bf5717f1b..45aa5e7ea 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -43,6 +43,7 @@ class HotStarIE(HotStarBaseIE): IE_NAME = 'hotstar' _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})' _TESTS = [{ + # contentData 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', 'info_dict': { 'id': '1000076273', @@ -57,6 +58,10 @@ class HotStarIE(HotStarBaseIE): # m3u8 download 'skip_download': True, } + }, { + # contentDetail + 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', + 'only_matching': True, }, { 'url': 'http://www.hotstar.com/sports/cricket/rajitha-sizzles-on-debut-with-329/2001477583', 'only_matching': True, @@ -74,8 +79,12 @@ class HotStarIE(HotStarBaseIE): r'<script>window\.APP_STATE\s*=\s*({.+?})</script>', webpage, 'app state'), video_id) video_data = {} + getters = ( + lambda x, k=k: x['initialState']['content%s' % k]['content'] + for k in ('Data', 'Detail') + ) for v in app_state.values(): - content = try_get(v, lambda x: x['initialState']['contentData']['content'], dict) + content = try_get(v, getters, dict) if content and content.get('contentId') == video_id: video_data = content From 16597c2f9492651c55e01f441ee9cb5c276209cb Mon Sep 17 00:00:00 2001 From: Jimm Stout <jamesstout1@gmail.com> Date: Thu, 29 Nov 2018 13:07:07 -0500 Subject: [PATCH 106/159] [gfycat] Update API endpoint (closes #18333) --- youtube_dl/extractor/gfycat.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py index a0670b645..c1b36a59b 100644 --- a/youtube_dl/extractor/gfycat.py +++ b/youtube_dl/extractor/gfycat.py @@ -53,7 +53,7 @@ class GfycatIE(InfoExtractor): video_id = self._match_id(url) gfy = self._download_json( - 'http://gfycat.com/cajax/get/%s' % video_id, + 'https://api.gfycat.com/v1/gfycats/%s' % video_id, video_id, 'Downloading video info') if 'error' in gfy: raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True) From f012823082c893c0fc1f96afb8a91f5b1c1ae07a Mon Sep 17 00:00:00 2001 From: Hakim Boyles <hak@volkanite.net> Date: Thu, 29 Nov 2018 14:20:27 -0400 Subject: [PATCH 107/159] [lynda] Fix authentication (closes #18158) --- youtube_dl/extractor/lynda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 4ba61cd8a..3084c6dff 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -15,7 +15,7 @@ from ..utils import ( class LyndaBaseIE(InfoExtractor): - _SIGNIN_URL = 'https://www.lynda.com/signin' + _SIGNIN_URL = 'https://www.lynda.com/signin/lynda' _PASSWORD_URL = 'https://www.lynda.com/signin/password' _USER_URL = 'https://www.lynda.com/signin/user' _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' 
From 3430ff9b07d4dc9dd39617af54aeffb381d88737 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 1 Dec 2018 16:45:51 +0700 Subject: [PATCH 108/159] [pornhub] Use actual URL host for requests (closes #18359) --- youtube_dl/extractor/pornhub.py | 34 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index c9c884095..e377de196 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -27,7 +27,7 @@ class PornHubIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P<id>[\da-z]+) @@ -129,7 +129,7 @@ class PornHubIE(InfoExtractor): @staticmethod def _extract_urls(webpage): return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)', + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -137,14 +137,16 @@ class PornHubIE(InfoExtractor): pattern, webpage, '%s count' % name, fatal=False)) def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or 'pornhub.com' + video_id = mobj.group('id') - self._set_cookie('pornhub.com', 'age_verified', '1') + self._set_cookie(host, 'age_verified', '1') def dl_webpage(platform): - self._set_cookie('pornhub.com', 'platform', platform) + self._set_cookie(host, 'platform', platform) return self._download_webpage( - 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id, + 'http://www.%s/view_video.php?viewkey=%s' % (host, video_id), video_id, 'Downloading %s webpage' % platform) webpage = dl_webpage('pc') @@ 
-306,7 +308,7 @@ class PornHubIE(InfoExtractor): class PornHubPlaylistBaseIE(InfoExtractor): - def _extract_entries(self, webpage): + def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see # https://github.com/rg3/youtube-dl/issues/11594). @@ -316,7 +318,7 @@ class PornHubPlaylistBaseIE(InfoExtractor): return [ self.url_result( - 'http://www.pornhub.com/%s' % video_url, + 'http://www.%s/%s' % (host, video_url), PornHubIE.ie_key(), video_title=title) for video_url, title in orderedSet(re.findall( r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', @@ -324,11 +326,13 @@ class PornHubPlaylistBaseIE(InfoExtractor): ] def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + playlist_id = mobj.group('id') webpage = self._download_webpage(url, playlist_id) - entries = self._extract_entries(webpage) + entries = self._extract_entries(webpage, host) playlist = self._parse_json( self._search_regex( @@ -343,7 +347,7 @@ class PornHubPlaylistBaseIE(InfoExtractor): class PornHubPlaylistIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.(?:com|net)/playlist/(?P<id>\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/playlist/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.pornhub.com/playlist/4667351', 'info_dict': { @@ -358,7 +362,7 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE): class PornHubUserVideosIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos' + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos' _TESTS = [{ 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'info_dict': { @@ -399,7 +403,9 @@ class 
PornHubUserVideosIE(PornHubPlaylistBaseIE): }] def _real_extract(self, url): - user_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + user_id = mobj.group('id') entries = [] for page_num in itertools.count(1): @@ -411,7 +417,7 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: break raise - page_entries = self._extract_entries(webpage) + page_entries = self._extract_entries(webpage, host) if not page_entries: break entries.extend(page_entries) From aa374bc78e5e4dbb8453bd367ae0e3c0db702a7a Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Sat, 1 Dec 2018 18:05:15 +0100 Subject: [PATCH 109/159] [utils] Fix random_birthday to generate existing dates only --- youtube_dl/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e84d35d4d..0b1c7cd6c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3948,8 +3948,12 @@ def write_xattr(path, key, value): def random_birthday(year_field, month_field, day_field): + start_date = datetime.date(1950, 1, 1) + end_date = datetime.date(1995, 12, 31) + offset = random.randint(0, (end_date - start_date).days) + random_date = start_date + datetime.timedelta(offset) return { - year_field: str(random.randint(1950, 1995)), - month_field: str(random.randint(1, 12)), - day_field: str(random.randint(1, 31)), + year_field: str(random_date.year), + month_field: str(random_date.month), + day_field: str(random_date.day), } From 1ead840d2c18d4add340117f676fd6694f0650d3 Mon Sep 17 00:00:00 2001 From: Ken Swenson <flat@esoteric.moe> Date: Fri, 9 Nov 2018 16:49:20 -0500 Subject: [PATCH 110/159] [tiktok] Add extractor (closes #18108) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tiktok.py | 79 ++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 youtube_dl/extractor/tiktok.py 
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index cd91c0fcb..547331078 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1124,6 +1124,7 @@ from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE from .threeqsdn import ThreeQSDNIE +from .tiktok import TikTokIE from .tinypic import TinyPicIE from .tmz import ( TMZIE, diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py new file mode 100644 index 000000000..d71b09c66 --- /dev/null +++ b/youtube_dl/extractor/tiktok.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + compat_str, + int_or_none, + str_or_none, + try_get, + url_or_none, +) + + +class TikTokIE(InfoExtractor): + _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://m.tiktok.com/v/6606727368545406213.html', + 'md5': 'd584b572e92fcd48888051f238022420', + 'info_dict': { + 'id': '6606727368545406213', + 'ext': 'mp4', + 'title': 'Zureeal on TikTok', + 'thumbnail': r're:^https?://.*~noop.image', + 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', + 'uploader': 'Zureeal', + 'width': 540, + 'height': 960, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + data = self._parse_json( + self._search_regex( + r'var\s+data\s*=\s*({.+?});', webpage, 'data' + ), video_id) + + title = self._og_search_title(webpage) + + description = str_or_none(try_get(data, lambda x: x['desc'])) + width = int_or_none(try_get(data, lambda x: x['video']['width'])) + height = int_or_none(try_get(data, lambda x: x['video']['height'])) + + formats = [] + + for count, (key, label) in enumerate((('play_addr_lowbr', 'Low'), ('play_addr', 'Normal'), ('download_addr', 'Download')), -2): + for format in 
try_get(data, lambda x: x['video'][key]['url_list']): + format_url = url_or_none(format) + if not format_url: + continue + formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'height': height, + 'width': width, + 'format_note': label, + 'quality': count + }) + + self._sort_formats(formats) + + uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) + + thumbnail = url_or_none( + try_get( + data, lambda x: x['video']['cover']['url_list'][0], compat_str)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'formats': formats, + 'thumbnail': thumbnail, + 'width': width, + 'height': height, + } From ce18a19be9af1e227e7162636cbf6b277adc1b41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Dec 2018 02:39:22 +0700 Subject: [PATCH 111/159] [tiktok] Improve extraction and add support for user pages (closes #18135) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/tiktok.py | 136 ++++++++++++++++++----------- 2 files changed, 91 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 547331078..0baed6b27 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1124,7 +1124,10 @@ from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE from .threeqsdn import ThreeQSDNIE -from .tiktok import TikTokIE +from .tiktok import ( + TikTokIE, + TikTokUserIE, +) from .tinypic import TinyPicIE from .tmz import ( TMZIE, diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index d71b09c66..083e9f36d 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( compat_str, + ExtractorError, int_or_none, str_or_none, try_get, @@ -11,69 +12,106 
@@ from ..utils import ( ) -class TikTokIE(InfoExtractor): - _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>[0-9]+)' +class TikTokBaseIE(InfoExtractor): + def _extract_aweme(self, data): + video = data['video'] + description = str_or_none(try_get(data, lambda x: x['desc'])) + width = int_or_none(try_get(data, lambda x: video['width'])) + height = int_or_none(try_get(data, lambda x: video['height'])) + + format_urls = set() + formats = [] + for format_id in ( + 'play_addr_lowbr', 'play_addr', 'play_addr_h264', + 'download_addr'): + for format in try_get( + video, lambda x: x[format_id]['url_list'], list) or []: + format_url = url_or_none(format) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'height': height, + 'width': width, + }) + self._sort_formats(formats) + + thumbnail = url_or_none(try_get( + video, lambda x: x['cover']['url_list'][0], compat_str)) + uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) + timestamp = int_or_none(data.get('create_time')) + comment_count = int_or_none(data.get('comment_count')) or int_or_none( + try_get(data, lambda x: x['statistics']['comment_count'])) + repost_count = int_or_none(try_get( + data, lambda x: x['statistics']['share_count'])) + + aweme_id = data['aweme_id'] + + return { + 'id': aweme_id, + 'title': uploader or aweme_id, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'timestamp': timestamp, + 'comment_count': comment_count, + 'repost_count': repost_count, + 'formats': formats, + } + + +class TikTokIE(TikTokBaseIE): + _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>\d+)' _TEST = { 'url': 'https://m.tiktok.com/v/6606727368545406213.html', 'md5': 'd584b572e92fcd48888051f238022420', 'info_dict': { 'id': '6606727368545406213', 'ext': 'mp4', - 'title': 'Zureeal on TikTok', - 'thumbnail': r're:^https?://.*~noop.image', + 'title': 'Zureeal', 
'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', + 'thumbnail': r're:^https?://.*~noop.image', 'uploader': 'Zureeal', - 'width': 540, - 'height': 960, + 'timestamp': 1538248586, + 'upload_date': '20180929', + 'comment_count': int, + 'repost_count': int, } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + data = self._parse_json(self._search_regex( + r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id) + return self._extract_aweme(data) - data = self._parse_json( - self._search_regex( - r'var\s+data\s*=\s*({.+?});', webpage, 'data' - ), video_id) - title = self._og_search_title(webpage) +class TikTokUserIE(TikTokBaseIE): + _VALID_URL = r'https?://(?:m\.)?tiktok\.com/h5/share/usr/(?P<id>\d+)' + _TEST = { + 'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html', + 'info_dict': { + 'id': '188294915489964032', + }, + 'playlist_mincount': 24, + } - description = str_or_none(try_get(data, lambda x: x['desc'])) - width = int_or_none(try_get(data, lambda x: x['video']['width'])) - height = int_or_none(try_get(data, lambda x: x['video']['height'])) - - formats = [] - - for count, (key, label) in enumerate((('play_addr_lowbr', 'Low'), ('play_addr', 'Normal'), ('download_addr', 'Download')), -2): - for format in try_get(data, lambda x: x['video'][key]['url_list']): - format_url = url_or_none(format) - if not format_url: - continue - formats.append({ - 'url': format_url, - 'ext': 'mp4', - 'height': height, - 'width': width, - 'format_note': label, - 'quality': count - }) - - self._sort_formats(formats) - - uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) - - thumbnail = url_or_none( - try_get( - data, lambda x: x['video']['cover']['url_list'][0], compat_str)) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'uploader': uploader, - 'formats': formats, - 'thumbnail': thumbnail, - 'width': width, - 'height': height, - } 
+ def _real_extract(self, url): + user_id = self._match_id(url) + data = self._download_json( + 'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id, + query={'_signature': '_'}) + entries = [] + for aweme in data['aweme_list']: + try: + entry = self._extract_aweme(aweme) + except ExtractorError: + continue + entry['extractor_key'] = TikTokIE.ie_key() + entries.append(entry) + return self.playlist_result(entries, user_id) From 1fa59a928e48ac81e2afb5dfe134fa6c40f64e44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Dec 2018 00:06:54 +0700 Subject: [PATCH 112/159] [ChangeLog] Actualize [ci skip] --- ChangeLog | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index f82c7ea35..ddd4bec14 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +version <unreleased> + +Core +* [utils] Fix random_birthday to generate existing dates only (#18284) + +Extractors ++ [tiktok] Add support for tiktok.com (#18108, #18135) +* [pornhub] Use actual URL host for requests (#18359) +* [lynda] Fix authentication (#18158, #18217) +* [gfycat] Update API endpoint (#18333, #18343) ++ [hotstar] Add support for alternative app state layout (#18320) +* [azmedien] Fix extraction (#18334, #18336) ++ [vimeo] Add support for VHX (Vimeo OTT) (#14835) +* [joj] Fix extraction (#18280, #18281) ++ [wistia] Add support for fast.wistia.com (#18287) + + version 2018.11.23 Core From ab896fa894d2395b886e5bb8168ca0e0e6e9517d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Dec 2018 00:10:20 +0700 Subject: [PATCH 113/159] release 2018.12.03 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 35cc8d6d0..4b35244a8 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md 
@@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.23** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.03*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.03** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.11.23 +[debug] youtube-dl version 2018.12.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index ddd4bec14..689d07826 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.12.03 Core * [utils] Fix random_birthday to generate existing dates only (#18284) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 7d72ad82d..837b0199b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -885,6 +885,8 @@ - 
**ThisAmericanLife** - **ThisAV** - **ThisOldHouse** + - **TikTok** + - **TikTokUser** - **tinypic**: tinypic.com videos - **TMZ** - **TMZArticle** @@ -979,6 +981,7 @@ - **VevoPlaylist** - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** + - **vhx:embed** - **Viafree** - **vice** - **vice:article** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4956365d0..8e1203892 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.11.23' +__version__ = '2018.12.03' From 5547014ad972a4364c4d4a613db6d3a18f25950e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 2 Dec 2018 20:01:36 +0100 Subject: [PATCH 114/159] [gamespot] add support reviews URLs --- youtube_dl/extractor/gamespot.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index ab647dd41..4236a5ed8 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(OnceIE): - _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article)s/(?:[^/]+/\d+-|embed/)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article|review)s/(?:[^/]+/\d+-|embed/)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', @@ -41,6 +41,9 @@ class GameSpotIE(OnceIE): }, { 'url': 'https://www.gamespot.com/articles/the-last-of-us-2-receives-new-ps4-trailer/1100-6454469/', 'only_matching': True, + }, { + 'url': 'https://www.gamespot.com/reviews/gears-of-war-review/1900-6161188/', + 'only_matching': True, }] def _real_extract(self, url): From 8bb0c9cc16b841ab89e15e620e00ee954ab316ed Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 5 Dec 2018 07:03:00 +0100 Subject: [PATCH 115/159] [tbs] 
fix info extraction(fixes #18403) --- youtube_dl/extractor/tbs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index 784f8ed66..e8a7c65e0 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -16,7 +16,7 @@ from ..utils import ( class TBSIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com(?P<path>/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+))' _TESTS = [{ 'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster', 'info_dict': { @@ -40,12 +40,12 @@ class TBSIE(TurnerBaseIE): }] def _real_extract(self, url): - site, display_id = re.match(self._VALID_URL, url).groups() + site, path, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) drupal_settings = self._parse_json(self._search_regex( r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>', webpage, 'drupal setting'), display_id) - video_data = drupal_settings['turner_playlist'][0] + video_data = next(v for v in drupal_settings['turner_playlist'] if v.get('url') == path) media_id = video_data['mediaID'] title = video_data['title'] From ae9d77dab54630ac480bd4082468d543a15c341e Mon Sep 17 00:00:00 2001 From: v-delta <45652398+v-delta@users.noreply.github.com> Date: Thu, 6 Dec 2018 17:24:35 +0100 Subject: [PATCH 116/159] [yourporn] Fix extraction (closes #18424) --- youtube_dl/extractor/yourporn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yourporn.py b/youtube_dl/extractor/yourporn.py index 6602f7c03..a9951f3b8 100644 --- a/youtube_dl/extractor/yourporn.py +++ b/youtube_dl/extractor/yourporn.py @@ -26,7 +26,7 @@ class YourPornIE(InfoExtractor): self._search_regex( 
r'data-vnfo=(["\'])(?P<data>{.+?})\1', webpage, 'data info', group='data'), - video_id)[video_id]) + video_id)[video_id]).replace('/cdn/', '/cdn2/') title = (self._search_regex( r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title', From 33cc1ea586480ccc60fd25f2d42cfb44eec605c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Dec 2018 00:00:06 +0700 Subject: [PATCH 117/159] [nrktv] Relax _VALID_URL (closes #18304, closes #18387) --- youtube_dl/extractor/nrk.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index a231735fb..c5001ef48 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -248,7 +248,7 @@ class NRKTVIE(NRKBaseIE): _VALID_URL = r'''(?x) https?:// (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie/[^/]+|program)/ + (?:serie(?:/[^/]+){1,2}|program)/ (?![Ee]pisodes)%s (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P<part_id>\d+))? @@ -362,6 +362,9 @@ class NRKTVIE(NRKBaseIE): }, { 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', + 'only_matching': True, }] From 15699ec8b0a9cb8d519b721a5cb8199afe62fc3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Dec 2018 00:49:24 +0700 Subject: [PATCH 118/159] [nrktv:season,series] Fix extraction and update tests (closes #17159, closes #17258) --- youtube_dl/extractor/nrk.py | 68 +++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index c5001ef48..48bc6fd7a 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -211,13 +211,13 @@ class NRKIE(NRKBaseIE): _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '2f7f6eeb2aacdd99885f355428715cfa', + 'md5': 
'706f34cdf1322577589e369e522b50ef', 'info_dict': { 'id': '150533', 'ext': 'mp4', 'title': 'Dompap og andre fugler i Piip-Show', 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 263, + 'duration': 262, } }, { # audio @@ -256,14 +256,14 @@ class NRKTVIE(NRKBaseIE): _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '4e9ca6629f09e588ed240fb11619922a', + 'md5': '9a167e54d04671eb6317a37b7bc8a280', 'info_dict': { 'id': 'MUHH48000314AA', 'ext': 'mp4', 'title': '20 spørsmål 23.05.2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, - 'series': '20 spørsmål - TV', + 'series': '20 spørsmål', 'episode': '23.05.2014', }, }, { @@ -301,7 +301,7 @@ class NRKTVIE(NRKBaseIE): 'id': 'MSPO40010515AH', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', 'duration': 772, 'series': 'Tour de Ski', 'episode': '06.01.2015', @@ -314,7 +314,7 @@ class NRKTVIE(NRKBaseIE): 'id': 'MSPO40010515BH', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', 'duration': 6175, 'series': 'Tour de Ski', 'episode': '06.01.2015', @@ -326,7 +326,7 @@ class NRKTVIE(NRKBaseIE): 'info_dict': { 'id': 'MSPO40010515', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', }, 'expected_warnings': ['Video is geo restricted'], }, { @@ -406,21 +406,35 @@ class NRKTVSerieBaseIE(InfoExtractor): def _extract_series(self, webpage, display_id, fatal=True): config = self._parse_json( self._search_regex( - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', webpage, 'config', - default='{}' if 
not fatal else NO_DEFAULT), + (r'INITIAL_DATA_*\s*=\s*({.+?})\s*;', + r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>'), + webpage, 'config', default='{}' if not fatal else NO_DEFAULT), display_id, fatal=False) if not config: return - return try_get(config, lambda x: x['series'], dict) + return try_get( + config, + (lambda x: x['initialState']['series'], lambda x: x['series']), + dict) + + def _extract_seasons(self, seasons): + if not isinstance(seasons, list): + return [] + entries = [] + for season in seasons: + entries.extend(self._extract_episodes(season)) + return entries def _extract_episodes(self, season): - entries = [] if not isinstance(season, dict): - return entries - episodes = season.get('episodes') - if not isinstance(episodes, list): - return entries - for episode in episodes: + return [] + return self._extract_entries(season.get('episodes')) + + def _extract_entries(self, entry_list): + if not isinstance(entry_list, list): + return [] + entries = [] + for episode in entry_list: nrk_id = episode.get('prfId') if not nrk_id or not isinstance(nrk_id, compat_str): continue @@ -465,7 +479,7 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' _TESTS = [{ - # new layout + # new layout, seasons 'url': 'https://tv.nrk.no/serie/backstage', 'info_dict': { 'id': 'backstage', @@ -474,20 +488,21 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, 'playlist_mincount': 60, }, { - # old layout + # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', 'info_dict': { 'id': 'groenn-glede', 'title': 'Grønn glede', 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', }, - 'playlist_mincount': 9, + 'playlist_mincount': 10, }, { - 'url': 'http://tv.nrksuper.no/serie/labyrint', + # old layout + 'url': 'https://tv.nrksuper.no/serie/labyrint', 'info_dict': { 'id': 'labyrint', 'title': 'Labyrint', - 'description': 
'md5:58afd450974c89e27d5a19212eee7115', + 'description': 'md5:318b597330fdac5959247c9b69fdb1ec', }, 'playlist_mincount': 3, }, { @@ -520,11 +535,11 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): description = try_get( series, lambda x: x['titles']['subtitle'], compat_str) entries = [] - for season in series['seasons']: - entries.extend(self._extract_episodes(season)) + entries.extend(self._extract_seasons(series.get('seasons'))) + entries.extend(self._extract_entries(series.get('instalments'))) return self.playlist_result(entries, series_id, title, description) - # Old layout (e.g. https://tv.nrk.no/serie/groenn-glede) + # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint) entries = [ self.url_result( 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( @@ -536,6 +551,9 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'seriestitle', webpage, 'title', default=None) or self._og_search_title( webpage, fatal=False) + if title: + title = self._search_regex( + r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title) description = self._html_search_meta( 'series_description', webpage, @@ -596,7 +614,7 @@ class NRKPlaylistIE(NRKPlaylistBaseIE): 'title': 'Rivertonprisen til Karin Fossum', 'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.', }, - 'playlist_count': 5, + 'playlist_count': 2, }] def _extract_title(self, webpage): From c976873c5b2912c06ff53e5193640ee8627edee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Dec 2018 00:54:58 +0700 Subject: [PATCH 119/159] [nrktv:series] Add support for extra materials --- youtube_dl/extractor/nrk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 48bc6fd7a..072f920a9 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -537,6 +537,7 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): entries = [] entries.extend(self._extract_seasons(series.get('seasons'))) 
entries.extend(self._extract_entries(series.get('instalments'))) + entries.extend(self._extract_episodes(series.get('extraMaterial'))) return self.playlist_result(entries, series_id, title, description) # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint) From dfe0a3a9d2e07ac1e5ad221912d03b999ebb4d75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Dec 2018 03:27:11 +0700 Subject: [PATCH 120/159] [lecturio] Add extractor (closes #18405) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/lecturio.py | 186 +++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+) create mode 100644 youtube_dl/extractor/lecturio.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 0baed6b27..e5f18a75d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -553,6 +553,10 @@ from .lcp import ( ) from .learnr import LearnrIE from .lecture2go import Lecture2GoIE +from .lecturio import ( + LecturioIE, + LecturioCourseIE, +) from .leeco import ( LeIE, LePlaylistIE, diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py new file mode 100644 index 000000000..62ff28e02 --- /dev/null +++ b/youtube_dl/extractor/lecturio.py @@ -0,0 +1,186 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + extract_attributes, + ExtractorError, + float_or_none, + int_or_none, + str_or_none, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class LecturioBaseIE(InfoExtractor): + _LOGIN_URL = 'https://app.lecturio.com/en/login' + _NETRC_MACHINE = 'lecturio' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + # Sets some cookies + _, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Downloading 
login popup') + + def is_logged(url_handle): + return self._LOGIN_URL not in compat_str(url_handle.geturl()) + + # Already logged in + if is_logged(urlh): + return + + login_form = { + 'signin[email]': username, + 'signin[password]': password, + 'signin[remember]': 'on', + } + + response, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form)) + + # Logged in successfully + if is_logged(urlh): + return + + errors = self._html_search_regex( + r'(?s)<ul[^>]+class=["\']error_list[^>]+>(.+?)</ul>', response, + 'errors', default=None) + if errors: + raise ExtractorError('Unable to login: %s' % errors, expected=True) + raise ExtractorError('Unable to log in') + + +class LecturioIE(LecturioBaseIE): + _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.lecture' + _TEST = { + 'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos', + 'md5': 'f576a797a5b7a5e4e4bbdfc25a6a6870', + 'info_dict': { + 'id': '39634', + 'ext': 'mp4', + 'title': 'Important Concepts and Terms – Introduction to Microbiology', + }, + 'skip': 'Requires lecturio account credentials', + } + + _CC_LANGS = { + 'German': 'de', + 'English': 'en', + 'Spanish': 'es', + 'French': 'fr', + 'Polish': 'pl', + 'Russian': 'ru', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://app.lecturio.com/en/lecture/%s/player.html' % display_id, + display_id) + + lecture_id = self._search_regex( + r'lecture_id\s*=\s*(?:L_)?(\d+)', webpage, 'lecture id') + + api_url = self._search_regex( + r'lectureDataLink\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'api url', group='url') + + video = self._download_json(api_url, display_id) + + title = video['title'].strip() + + formats = [] + for format_ in video['content']['media']: + if not isinstance(format_, dict): + continue + file_ = format_.get('file') + if not file_: + 
continue + ext = determine_ext(file_) + if ext == 'smil': + # smil contains only broken RTMP formats anyway + continue + file_url = url_or_none(file_) + if not file_url: + continue + label = str_or_none(format_.get('label')) + filesize = int_or_none(format_.get('fileSize')) + formats.append({ + 'url': file_url, + 'format_id': label, + 'filesize': float_or_none(filesize, invscale=1000) + }) + self._sort_formats(formats) + + subtitles = {} + automatic_captions = {} + cc = self._parse_json( + self._search_regex( + r'subtitleUrls\s*:\s*({.+?})\s*,', webpage, 'subtitles', + default='{}'), display_id, fatal=False) + for cc_label, cc_url in cc.items(): + cc_url = url_or_none(cc_url) + if not cc_url: + continue + sub_dict = automatic_captions if 'auto-translated' in cc_label else subtitles + lang = self._search_regex( + r'/([a-z]{2})_', cc_url, 'lang', default=cc_label.split()[0]) + sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({ + 'url': cc_url, + }) + + return { + 'id': lecture_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, + } + + +class LecturioCourseIE(LecturioBaseIE): + _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.course' + _TEST = { + 'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/', + 'info_dict': { + 'id': 'microbiology-introduction', + 'title': 'Microbiology: Introduction', + }, + 'playlist_count': 45, + 'skip': 'Requires lecturio account credentials', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'(?s)<[^>]+\bdata-url=(["\'])(?:(?!\1).)+\.lecture\b[^>]+>', + webpage): + params = extract_attributes(mobj.group(0)) + lecture_url = urljoin(url, params.get('data-url')) + lecture_id = params.get('data-id') + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + + 
title = self._search_regex( + r'<span[^>]+class=["\']content-title[^>]+>([^<]+)', webpage, + 'title', default=None) + + return self.playlist_result(entries, display_id, title) From ebb0449049c198f04103502c95a13171b854d1c7 Mon Sep 17 00:00:00 2001 From: ealgase <mostdigitsofpi@gmail.com> Date: Thu, 6 Dec 2018 15:36:08 -0500 Subject: [PATCH 121/159] [xvideos] Switch to HTTPS (closes #18422) --- youtube_dl/extractor/xvideos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index efee95651..ec2d913fc 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -45,7 +45,7 @@ class XVideosIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.xvideos.com/video%s/' % video_id, video_id) + 'https://www.xvideos.com/video%s/' % video_id, video_id) mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) if mobj: From 8c5879715f4d979b83c49d44a9094307247097ba Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Thu, 6 Dec 2018 21:41:02 +0100 Subject: [PATCH 122/159] [ard:mediathek] Fix title and description extraction (closes #18349) --- youtube_dl/extractor/ard.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6bf8f61eb..84e96f769 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -173,13 +173,18 @@ class ARDMediathekIE(InfoExtractor): title = self._html_search_regex( [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', r'<meta name="dcterms\.title" content="(.*?)"/>', - r'<h4 class="headline">(.*?)</h4>'], + r'<h4 class="headline">(.*?)</h4>', + r'<title[^>]*>(.*?)'], webpage, 'title') description = self._html_search_meta( 'dcterms.abstract', webpage, 'description', default=None) if description is None: description = self._html_search_meta( - 'description', webpage, 'meta 
description') + 'description', webpage, 'meta description', default=None) + if description is None: + description = self._html_search_regex( + r'(.+?)

', + webpage, 'teaser text', default=None) # Thumbnail is sometimes not present. # It is in the mobile version, but that seems to use a different URL From c3c098dcf2826ab4d668a92c9137cca2c0c42a4f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 7 Dec 2018 18:52:01 +0100 Subject: [PATCH 123/159] [hotstar] fix video data extraction(closes #18386) --- youtube_dl/extractor/hotstar.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index 45aa5e7ea..8de9c4faf 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -79,7 +79,7 @@ class HotStarIE(HotStarBaseIE): r'', webpage, 'app state'), video_id) video_data = {} - getters = ( + getters = list( lambda x, k=k: x['initialState']['content%s' % k]['content'] for k in ('Data', 'Detail') ) @@ -87,6 +87,7 @@ class HotStarIE(HotStarBaseIE): content = try_get(v, getters, dict) if content and content.get('contentId') == video_id: video_data = content + break title = video_data['title'] From 9235b5091cedcc21c8dc32d4b292340edeee4ed0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 8 Dec 2018 23:57:40 +0700 Subject: [PATCH 124/159] [iprima] Relax _VALID_URL (closes #18453) --- youtube_dl/extractor/iprima.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 1d58d6e85..11a6629d2 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -12,7 +12,7 @@ from ..utils import ( class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://(?:play|prima)\.iprima\.cz/(?:.+/)?(?P[^?#]+)' + _VALID_URL = r'https?://(?:play|prima|www)\.iprima\.cz/(?:[^/]+/)*(?P[^/?#&]+)' _GEO_BYPASS = False _TESTS = [{ @@ -41,6 +41,9 @@ class IPrimaIE(InfoExtractor): # iframe prima.iprima.cz 'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha', 'only_matching': True, + }, { + 'url': 
'http://www.iprima.cz/filmy/desne-rande', + 'only_matching': True, }] def _real_extract(self, url): From 1d88b3e6e6e59e4b52305faf6c1bf1fd69c555ee Mon Sep 17 00:00:00 2001 From: aegamesi Date: Sat, 29 Apr 2017 22:56:33 -0700 Subject: [PATCH 125/159] [YoutubeDL] Recognize expires=0 as session cookies and send session cookies with requests --- youtube_dl/YoutubeDL.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 38ba43a97..2433f74f4 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -2300,7 +2300,13 @@ class YoutubeDL(object): self.cookiejar = compat_cookiejar.MozillaCookieJar( opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): - self.cookiejar.load() + self.cookiejar.load(ignore_discard=True, ignore_expires=True) + # Force CookieJar to treat 'expires=0' cookies as session/discard cookies + # Fixes https://bugs.python.org/issue17164 + for cookie in self.cookiejar: + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: From 1bab3437046646da4ebe2b8e0c7fdc25aa1072ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Dec 2018 06:00:32 +0700 Subject: [PATCH 126/159] [YoutubeDL] Introduce YoutubeDLCookieJar and clarify the rationale behind session cookies (closes #12929) --- youtube_dl/YoutubeDL.py | 12 +++--------- youtube_dl/utils.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2433f74f4..4493fd0e1 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -88,6 +88,7 @@ from .utils import ( version_tuple, write_json_file, write_string, + YoutubeDLCookieJar, YoutubeDLCookieProcessor, YoutubeDLHandler, ) @@ -558,7 +559,7 @@ class YoutubeDL(object): self.restore_console_title() if self.params.get('cookiefile') is not None: 
- self.cookiejar.save() + self.cookiejar.save(ignore_discard=True, ignore_expires=True) def trouble(self, message=None, tb=None): """Determine action to take when a download problem appears. @@ -2297,16 +2298,9 @@ class YoutubeDL(object): self.cookiejar = compat_cookiejar.CookieJar() else: opts_cookiefile = expand_path(opts_cookiefile) - self.cookiejar = compat_cookiejar.MozillaCookieJar( - opts_cookiefile) + self.cookiejar = YoutubeDLCookieJar(opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): self.cookiejar.load(ignore_discard=True, ignore_expires=True) - # Force CookieJar to treat 'expires=0' cookies as session/discard cookies - # Fixes https://bugs.python.org/issue17164 - for cookie in self.cookiejar: - if cookie.expires == 0: - cookie.expires = None - cookie.discard = True cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0b1c7cd6c..62e769fd5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ from .compat import ( compat_HTMLParser, compat_basestring, compat_chr, + compat_cookiejar, compat_ctypes_WINFUNCTYPE, compat_etree_fromstring, compat_expanduser, @@ -1139,6 +1140,33 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): req, **kwargs) +class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + # Store session cookies with `expires` set to 0 instead of an empty + # string + for cookie in self: + if cookie.expires is None: + cookie.expires = 0 + compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires) + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires) + # Session cookies are denoted by either `expires` field set to + # an empty string or 0. 
MozillaCookieJar only recognizes the former + # (see [1]). So we need force the latter to be recognized as session + # cookies on our own. + # Session cookies may be important for cookies-based authentication, + # e.g. usually, when user does not check 'Remember me' check box while + # logging in on a site, some important cookies are stored as session + # cookies so that not recognizing them will result in failed login. + # 1. https://bugs.python.org/issue17164 + for cookie in self: + # Treat `expires=0` cookies as session cookies + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True + + class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): def __init__(self, cookiejar=None): compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) From 5f47a60c5d66d65b505131a672a3bad67ddaa00f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 9 Dec 2018 09:35:17 +0100 Subject: [PATCH 127/159] [imgur] improve gallery and album detection and extraction(closes #9133)(closes #16577)(closes #17223)(closes #18404) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/imgur.py | 86 +++++++++++++++--------------- 2 files changed, 44 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e5f18a75d..e28db6968 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -483,6 +483,7 @@ from .imdb import ( from .imgur import ( ImgurIE, ImgurAlbumIE, + ImgurGalleryIE, ) from .ina import InaIE from .inc import IncIE diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index ecc958a17..0eb54db3f 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -12,7 +12,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z0-9]+)?$' + _VALID_URL = 
r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|(?:t(?:opic)?|r)/[^/]+)/)(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -20,28 +20,9 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/A61SaA1', - 'info_dict': { - 'id': 'A61SaA1', - 'ext': 'mp4', - 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The magic of the Internet', - }, - }, { - 'url': 'https://imgur.com/gallery/YcAQlkx', - 'info_dict': { - 'id': 'YcAQlkx', - 'ext': 'mp4', - 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', - } - }, { - 'url': 'http://imgur.com/topic/Funny/N8rOudd', - 'only_matching': True, - }, { - 'url': 'http://imgur.com/r/aww/VQcQPhM', 'only_matching': True, }, { 'url': 'https://i.imgur.com/crGpqCV.mp4', @@ -50,8 +31,8 @@ class ImgurIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - gifv_url = 'https://i.imgur.com/{id}.gifv'.format(id=video_id) - webpage = self._download_webpage(gifv_url, video_id) + webpage = self._download_webpage( + 'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id) width = int_or_none(self._og_search_property( 'video:width', webpage, default=None)) @@ -72,7 +53,6 @@ class ImgurIE(InfoExtractor): 'format_id': m.group('type').partition('/')[2], 'url': self._proto_relative_url(m.group('src')), 'ext': mimetype2ext(m.group('type')), - 'acodec': 'none', 'width': width, 'height': height, 'http_headers': { @@ -107,44 +87,64 @@ class ImgurIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'description': self._og_search_description(webpage, default=None), 'title': self._og_search_title(webpage), } -class ImgurAlbumIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:i\.)?imgur\.com/(?:(?:a|gallery|topic/[^/]+)/)?(?P[a-zA-Z0-9]{5})(?:[/?#&]+)?$' +class ImgurGalleryIE(InfoExtractor): + IE_NAME = 'imgur:gallery' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/]+)/(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://imgur.com/gallery/Q95ko', 'info_dict': { 'id': 'Q95ko', + 'title': 'Adding faces make every GIF better', }, 'playlist_count': 25, }, { - 'url': 'http://imgur.com/a/j6Orj', + 'url': 'http://imgur.com/topic/Aww/ll5Vk', 'only_matching': True, }, { - 'url': 'http://imgur.com/topic/Aww/ll5Vk', + 'url': 'https://imgur.com/gallery/YcAQlkx', + 'info_dict': { + 'id': 'YcAQlkx', + 'ext': 'mp4', + 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', + } + }, { + 'url': 'http://imgur.com/topic/Funny/N8rOudd', + 'only_matching': True, + }, { + 'url': 'http://imgur.com/r/aww/VQcQPhM', 'only_matching': True, }] def _real_extract(self, url): - album_id = self._match_id(url) + gallery_id = self._match_id(url) - album_images = self._download_json( - 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, - album_id, fatal=False) + data = self._download_json( + 'https://imgur.com/gallery/%s.json' % gallery_id, + gallery_id)['data']['image'] - if album_images: - data = album_images.get('data') - if data and isinstance(data, dict): - images = data.get('images') - if images and isinstance(images, list): - entries = [ - self.url_result('http://imgur.com/%s' % image['hash']) - for image in images if image.get('hash')] - return self.playlist_result(entries, album_id) + if data.get('is_album'): + entries = [ + self.url_result('http://imgur.com/%s' % image['hash'], ImgurIE.ie_key(), image['hash']) + for image in data['album_images']['images'] if image.get('hash')] + return self.playlist_result(entries, gallery_id, data.get('title'), data.get('description')) - # Fallback to single video - return self.url_result('http://imgur.com/%s' % album_id, 
ImgurIE.ie_key()) + return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id) + + +class ImgurAlbumIE(ImgurGalleryIE): + IE_NAME = 'imgur:album' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'http://imgur.com/a/j6Orj', + 'info_dict': { + 'id': 'j6Orj', + 'title': 'A Literary Analysis of "Star Wars: The Force Awakens"', + }, + 'playlist_count': 12, + }] From 3ad6dabd33307c0125fd462c4988083e360c40ad Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 9 Dec 2018 10:04:00 +0100 Subject: [PATCH 128/159] [aenetworks] add support for History Vault(closes #18460) --- youtube_dl/extractor/aenetworks.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 398e56ea3..85ec6392d 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -22,18 +22,19 @@ class AENetworksBaseIE(ThePlatformIE): class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' - IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' + IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' _VALID_URL = r'''(?x) https?:// (?:www\.)? 
(?P - (?:history|aetv|mylifetime|lifetimemovieclub)\.com| + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| fyi\.tv )/ (?: shows/(?P[^/]+(?:/[^/]+){0,2})| movies/(?P[^/]+)(?:/full-movie)?| - specials/(?P[^/]+)/full-special + specials/(?P[^/]+)/full-special| + collections/[^/]+/(?P[^/]+) ) ''' _TESTS = [{ @@ -80,6 +81,9 @@ class AENetworksIE(AENetworksBaseIE): }, { 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward', + 'only_matching': True }] _DOMAIN_TO_REQUESTOR_ID = { 'history.com': 'HISTORY', @@ -90,9 +94,9 @@ class AENetworksIE(AENetworksBaseIE): } def _real_extract(self, url): - domain, show_path, movie_display_id, special_display_id = re.match(self._VALID_URL, url).groups() - display_id = show_path or movie_display_id or special_display_id - webpage = self._download_webpage(url, display_id) + domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups() + display_id = show_path or movie_display_id or special_display_id or collection_display_id + webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers()) if show_path: url_parts = show_path.split('/') url_parts_len = len(url_parts) From 5ee7ae5c7577c9c137a7b38edd5ad01ae3a40376 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Dec 2018 22:28:24 +0700 Subject: [PATCH 129/159] [teachable] Add support for teachable based platform sites (closes #5451, closes #18150, closes #18272) --- youtube_dl/extractor/extractors.py | 8 +- youtube_dl/extractor/generic.py | 5 + .../extractor/{upskill.py => teachable.py} | 129 ++++++++++++++---- 3 files changed, 115 insertions(+), 27 deletions(-) rename youtube_dl/extractor/{upskill.py => teachable.py} (52%) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 
e28db6968..6a5d12ab1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1091,6 +1091,10 @@ from .tass import TassIE from .tastytrade import TastyTradeIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE +from .teachable import ( + TeachableIE, + TeachableCourseIE, +) from .teachertube import ( TeacherTubeIE, TeacherTubeUserIE, @@ -1240,10 +1244,6 @@ from .uplynk import ( UplynkIE, UplynkPreplayIE, ) -from .upskill import ( - UpskillIE, - UpskillCourseIE, -) from .urort import UrortIE from .urplay import URPlayIE from .usanetwork import USANetworkIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 59cf03faf..65b482333 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -109,6 +109,7 @@ from .vice import ViceIE from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE +from .teachable import TeachableIE from .indavideo import IndavideoEmbedIE from .apa import APAIE from .foxnews import FoxNewsIE @@ -3112,6 +3113,10 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + teachable_url = TeachableIE._extract_url(webpage, url) + if teachable_url: + return self.url_result(teachable_url) + indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) if indavideo_urls: return self.playlist_from_matches( diff --git a/youtube_dl/extractor/upskill.py b/youtube_dl/extractor/teachable.py similarity index 52% rename from youtube_dl/extractor/upskill.py rename to youtube_dl/extractor/teachable.py index 30297b4dd..a0c46b2e6 100644 --- a/youtube_dl/extractor/upskill.py +++ b/youtube_dl/extractor/teachable.py @@ -14,20 +14,38 @@ from ..utils import ( ) -class UpskillBaseIE(InfoExtractor): - _LOGIN_URL = 'http://upskillcourses.com/sign_in' - _NETRC_MACHINE = 'upskill' +class TeachableBaseIE(InfoExtractor): + _NETRC_MACHINE = 
'teachable' + _URL_PREFIX = 'teachable:' + + _SITES = { + # Only notable ones here + 'upskillcourses.com': 'upskill', + 'academy.gns3.com': 'gns3', + 'academyhacker.com': 'academyhacker', + 'stackskills.com': 'stackskills', + 'market.saleshacker.com': 'saleshacker', + 'learnability.org': 'learnability', + 'edurila.com': 'edurila', + } + + _VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(re.escape(site) for site in _SITES.keys())) def _real_initialize(self): - self._login() + self._logged_in = False - def _login(self): - username, password = self._get_login_info() + def _login(self, site): + if self._logged_in: + return + + username, password = self._get_login_info( + netrc_machine=self._SITES.get(site, site)) if username is None: return login_page, urlh = self._download_webpage_handle( - self._LOGIN_URL, None, 'Downloading login page') + 'https://%s/sign_in' % site, None, + 'Downloading %s login page' % site) login_url = compat_str(urlh.geturl()) @@ -46,18 +64,24 @@ class UpskillBaseIE(InfoExtractor): post_url = urljoin(login_url, post_url) response = self._download_webpage( - post_url, None, 'Logging in', + post_url, None, 'Logging in to %s' % site, data=urlencode_postdata(login_form), headers={ 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': login_url, }) + if '>I accept the new Privacy Policy<' in response: + raise ExtractorError( + 'Unable to login: %s asks you to accept new Privacy Policy. ' + 'Go to https://%s/ and accept.' 
% (site, site), expected=True) + # Successful login if any(re.search(p, response) for p in ( r'class=["\']user-signout', r']+\bhref=["\']/sign_out', r'>\s*Log out\s*<')): + self._logged_in = True return message = get_element_by_class('alert', response) @@ -68,8 +92,14 @@ class UpskillBaseIE(InfoExtractor): raise ExtractorError('Unable to log in') -class UpskillIE(UpskillBaseIE): - _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P\d+)' +class TeachableIE(TeachableBaseIE): + _VALID_URL = r'''(?x) + (?: + %shttps?://(?P[^/]+)| + https?://(?:www\.)?(?P%s) + ) + /courses/[^/]+/lectures/(?P\d+) + ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', @@ -77,7 +107,7 @@ class UpskillIE(UpskillBaseIE): 'id': 'uzw6zw58or', 'ext': 'mp4', 'title': 'Welcome to the Course!', - 'description': 'md5:8d66c13403783370af62ca97a7357bdd', + 'description': 'md5:65edb0affa582974de4625b9cdea1107', 'duration': 138.763, 'timestamp': 1479846621, 'upload_date': '20161122', @@ -88,10 +118,38 @@ class UpskillIE(UpskillBaseIE): }, { 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', 'only_matching': True, + }, { + 'url': 'https://academy.gns3.com/courses/423415/lectures/6885939', + 'only_matching': True, + }, { + 'url': 'teachable:https://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'only_matching': True, }] + @staticmethod + def _is_teachable(webpage): + return 'teachableTracker.linker:autoLink' in webpage and re.search( + r']+href=["\']https?://process\.fs\.teachablecdn\.com', + webpage) + + @staticmethod + def _extract_url(webpage, source_url): + if not TeachableIE._is_teachable(webpage): + print('NOT TEACHABLE') + return + if re.match(r'https?://[^/]+/(?:courses|p)', source_url): + return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url) + def _real_extract(self, url): - video_id = self._match_id(url) + mobj = 
re.match(self._VALID_URL, url) + site = mobj.group('site') or mobj.group('site_t') + video_id = mobj.group('id') + + self._login(site) + + prefixed = url.startswith(self._URL_PREFIX) + if prefixed: + url = url[len(self._URL_PREFIX):] webpage = self._download_webpage(url, video_id) @@ -113,12 +171,18 @@ class UpskillIE(UpskillBaseIE): } -class UpskillCourseIE(UpskillBaseIE): - _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P[^/?#&]+)' +class TeachableCourseIE(TeachableBaseIE): + _VALID_URL = r'''(?x) + (?: + %shttps?://(?P[^/]+)| + https?://(?:www\.)?(?P%s) + ) + /(?:courses|p)/(?:enrolled/)?(?P[^/?#&]+) + ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', 'info_dict': { - 'id': '119763', + 'id': 'essential-web-developer-course', 'title': 'The Essential Web Developer Course (Free)', }, 'playlist_count': 192, @@ -128,21 +192,37 @@ class UpskillCourseIE(UpskillBaseIE): }, { 'url': 'http://upskillcourses.com/courses/enrolled/119763', 'only_matching': True, + }, { + 'url': 'https://academy.gns3.com/courses/enrolled/423415', + 'only_matching': True, + }, { + 'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini', + 'only_matching': True, + }, { + 'url': 'teachable:https://filmsimplified.com/p/davinci-resolve-15-crash-course', + 'only_matching': True, }] @classmethod def suitable(cls, url): - return False if UpskillIE.suitable(url) else super( - UpskillCourseIE, cls).suitable(url) + return False if TeachableIE.suitable(url) else super( + TeachableCourseIE, cls).suitable(url) def _real_extract(self, url): - course_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + site = mobj.group('site') or mobj.group('site_t') + course_id = mobj.group('id') + + self._login(site) + + prefixed = url.startswith(self._URL_PREFIX) + if prefixed: + prefix = self._URL_PREFIX + url = url[len(prefix):] webpage = self._download_webpage(url, course_id) - 
course_id = self._search_regex( - r'data-course-id=["\'](\d+)', webpage, 'course id', - default=course_id) + url_base = 'https://%s/' % site entries = [] @@ -162,10 +242,13 @@ class UpskillCourseIE(UpskillBaseIE): title = self._html_search_regex( r']+class=["\']lecture-name[^>]+>([^<]+)', li, 'title', default=None) + entry_url = urljoin(url_base, lecture_url) + if prefixed: + entry_url = self._URL_PREFIX + entry_url entries.append( self.url_result( - urljoin('http://upskillcourses.com/', lecture_url), - ie=UpskillIE.ie_key(), video_id=lecture_id, + entry_url, + ie=TeachableIE.ie_key(), video_id=lecture_id, video_title=clean_html(title))) course_title = self._html_search_regex( From 9e02c2c704d15d56b92ebe2fc8d481f99f5d106d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Dec 2018 22:56:33 +0700 Subject: [PATCH 130/159] [YoutubeDLCookieJar] Add test for keeping session cookies --- test/test_YoutubeDLCookieJar.py | 34 +++++++++++++++++++++++ test/testdata/cookies/session_cookies.txt | 6 ++++ 2 files changed, 40 insertions(+) create mode 100644 test/test_YoutubeDLCookieJar.py create mode 100644 test/testdata/cookies/session_cookies.txt diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py new file mode 100644 index 000000000..6a8243590 --- /dev/null +++ b/test/test_YoutubeDLCookieJar.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +# coding: utf-8 + +from __future__ import unicode_literals + +import os +import re +import sys +import tempfile +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.utils import YoutubeDLCookieJar + + +class TestYoutubeDLCookieJar(unittest.TestCase): + def test_keep_session_cookies(self): + cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt') + cookiejar.load(ignore_discard=True, ignore_expires=True) + tf = tempfile.NamedTemporaryFile(delete=False) + try: + cookiejar.save(filename=tf.name, 
ignore_discard=True, ignore_expires=True) + temp = tf.read().decode('utf-8') + self.assertTrue(re.search( + r'www\.foobar\.foobar\s+FALSE\s+/\s+TRUE\s+0\s+YoutubeDLExpiresEmpty\s+YoutubeDLExpiresEmptyValue', temp)) + self.assertTrue(re.search( + r'www\.foobar\.foobar\s+FALSE\s+/\s+TRUE\s+0\s+YoutubeDLExpires0\s+YoutubeDLExpires0Value', temp)) + finally: + tf.close() + os.remove(tf.name) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/testdata/cookies/session_cookies.txt b/test/testdata/cookies/session_cookies.txt new file mode 100644 index 000000000..91e5c9231 --- /dev/null +++ b/test/testdata/cookies/session_cookies.txt @@ -0,0 +1,6 @@ +# Netscape HTTP Cookie File +# http://curl.haxx.se/rfc/cookie_spec.html +# This is a generated file! Do not edit. + +www.foobar.foobar FALSE / TRUE 0 YoutubeDLExpires0 YoutubeDLExpires0Value +www.foobar.foobar FALSE / TRUE 0 YoutubeDLExpiresEmpty YoutubeDLExpiresEmptyValue From 24cc64254c03c56229654b13d833c600d7048bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Dec 2018 23:08:16 +0700 Subject: [PATCH 131/159] [ChangeLog] Actualize [ci skip] --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index 689d07826..6afee3a28 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version + +Core +* [YoutubeDL] Keep session cookies in cookie file between runs +* [YoutubeDL] Recognize session cookies with expired set to 0 (#12929) + +Extractors ++ [teachable] Add support for teachable platform sites (#5451, #18150, #18272) ++ [aenetworks] Add support for historyvault.com (#18460) +* [imgur] Improve gallery and album detection and extraction (#9133, #16577, + #17223, #18404) +* [iprima] Relax URL regular expression (#18453) +* [hotstar] Fix video data extraction (#18386) +* [ard:mediathek] Fix title and description extraction (#18349, #18371) +* [xvideos] Switch to HTTPS (#18422, #18427) ++ [lecturio] Add support for lecturio.com 
(#18405) ++ [nrktv:series] Add support for extra materials +* [nrktv:season,series] Fix extraction (#17159, #17258) +* [nrktv] Relax URL regular expression (#18304, #18387) +* [yourporn] Fix extraction (#18424, #18425) +* [tbs] Fix info extraction (#18403) ++ [gamespot] Add support for review URLs + + version 2018.12.03 Core From cefe42c412168e95b78dcefc5cdba608dba7dd02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Dec 2018 23:11:32 +0700 Subject: [PATCH 132/159] release 2018.12.09 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 11 +++++++---- youtube_dl/version.py | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 4b35244a8..1ccf410a7 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.03*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.03** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.09** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.12.03 +[debug] youtube-dl version 2018.12.09 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 6afee3a28..9aef67b52 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.12.09 Core * [YoutubeDL] Keep session cookies in cookie file between runs diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 837b0199b..31d20f255 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -33,7 +33,7 @@ - **AdobeTVShow** - **AdobeTVVideo** - **AdultSwim** - - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network + - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault - **afreecatv**: afreecatv.com - **AirMozilla** - **AliExpressLive** @@ -376,7 +376,8 @@ - **imdb**: Internet Movie Database trailers - **imdb:list**: Internet Movie Database lists - **Imgur** - - **ImgurAlbum** + - **imgur:album** + - **imgur:gallery** - **Ina** - **Inc** - **IndavideoEmbed** @@ -435,6 +436,8 @@ - **Le**: 乐视网 - **Learnr** - **Lecture2Go** + - **Lecturio** + - **LecturioCourse** - **LEGO** - **Lemonde** - **Lenta** @@ -853,6 +856,8 @@ - **TastyTrade** - **TBS** - **TDSLifeway** + - 
**Teachable** + - **TeachableCourse** - **teachertube**: teachertube.com videos - **teachertube:user:collection**: teachertube.com user and collection videos - **TeachingChannel** @@ -961,8 +966,6 @@ - **uol.com.br** - **uplynk** - **uplynk:preplay** - - **Upskill** - - **UpskillCourse** - **Urort**: NRK P3 Urørt - **URPlay** - **USANetwork** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8e1203892..d15b9583f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.12.03' +__version__ = '2018.12.09' From 59c3940165efaee6705ff06ef4fc4ac2c701107d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 10 Dec 2018 01:37:10 +0700 Subject: [PATCH 133/159] [ard:mediathek] Add support for classic.ardmediathek.de (closes #18473) --- youtube_dl/extractor/ard.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 84e96f769..2e1536a1b 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -21,7 +21,7 @@ from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' + _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' 
_TESTS = [{ # available till 26.07.2022 @@ -51,6 +51,9 @@ class ARDMediathekIE(InfoExtractor): # audio 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', 'only_matching': True, + }, { + 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', + 'only_matching': True, }] def _extract_media_info(self, media_info_url, webpage, video_id): From 6e29458f2479c4535f8605032b9492f52bb37f17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 10 Dec 2018 04:30:00 +0700 Subject: [PATCH 134/159] [test/testdata/cookies/session_cookies.txt] Fix empty expires test data --- test/testdata/cookies/session_cookies.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/testdata/cookies/session_cookies.txt b/test/testdata/cookies/session_cookies.txt index 91e5c9231..f6996f031 100644 --- a/test/testdata/cookies/session_cookies.txt +++ b/test/testdata/cookies/session_cookies.txt @@ -2,5 +2,5 @@ # http://curl.haxx.se/rfc/cookie_spec.html # This is a generated file! Do not edit. 
+www.foobar.foobar FALSE / TRUE YoutubeDLExpiresEmpty YoutubeDLExpiresEmptyValue www.foobar.foobar FALSE / TRUE 0 YoutubeDLExpires0 YoutubeDLExpires0Value -www.foobar.foobar FALSE / TRUE 0 YoutubeDLExpiresEmpty YoutubeDLExpiresEmptyValue From 102a4e54c5c0819233773cc15398bc901a218f4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 10 Dec 2018 10:10:28 +0700 Subject: [PATCH 135/159] [teachable] Remove debug output --- youtube_dl/extractor/teachable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index a0c46b2e6..47ac95ee8 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -135,7 +135,6 @@ class TeachableIE(TeachableBaseIE): @staticmethod def _extract_url(webpage, source_url): if not TeachableIE._is_teachable(webpage): - print('NOT TEACHABLE') return if re.match(r'https?://[^/]+/(?:courses|p)', source_url): return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url) From 13e17cd28e7f9e3bd8be4fa7b073a5cb96f5959f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 10 Dec 2018 14:59:57 +0100 Subject: [PATCH 136/159] [uol] fix format url extraction(closes 18480) --- youtube_dl/extractor/uol.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py index e67083004..08f0c072e 100644 --- a/youtube_dl/extractor/uol.py +++ b/youtube_dl/extractor/uol.py @@ -61,7 +61,7 @@ class UOLIE(InfoExtractor): 'height': 360, }, '5': { - 'width': 1080, + 'width': 1280, 'height': 720, }, '6': { @@ -80,6 +80,10 @@ class UOLIE(InfoExtractor): 'width': 568, 'height': 320, }, + '11': { + 'width': 640, + 'height': 360, + } } def _real_extract(self, url): @@ -111,19 +115,31 @@ class UOLIE(InfoExtractor): 'ver': video_data.get('numRevision', 2), 'r': 'http://mais.uol.com.br', } + for k in ('token', 'sign'): + v = video_data.get(k) + if v: + query[k] = v + formats = [] 
for f in video_data.get('formats', []): f_url = f.get('url') or f.get('secureUrl') if not f_url: continue + f_url = update_url_query(f_url, query) format_id = str_or_none(f.get('id')) + if format_id == '10': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + continue fmt = { 'format_id': format_id, - 'url': update_url_query(f_url, query), + 'url': f_url, + 'source_preference': 1, } fmt.update(self._FORMATS.get(format_id, {})) formats.append(fmt) - self._sort_formats(formats) + self._sort_formats(formats, ('height', 'width', 'source_preference', 'tbr', 'ext')) tags = [] for tag in video_data.get('tags', []): From 0a05cfabb6338be750474a7286ce0d72a4d7c142 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 11 Dec 2018 23:45:02 +0700 Subject: [PATCH 137/159] [lecturio] Improve subtitles extraction (closes #18488) --- youtube_dl/extractor/lecturio.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py index 62ff28e02..0f1265cdf 100644 --- a/youtube_dl/extractor/lecturio.py +++ b/youtube_dl/extractor/lecturio.py @@ -136,9 +136,15 @@ class LecturioIE(LecturioBaseIE): cc_url = url_or_none(cc_url) if not cc_url: continue - sub_dict = automatic_captions if 'auto-translated' in cc_label else subtitles lang = self._search_regex( - r'/([a-z]{2})_', cc_url, 'lang', default=cc_label.split()[0]) + r'/([a-z]{2})_', cc_url, 'lang', + default=cc_label.split()[0] if cc_label else 'en') + original_lang = self._search_regex( + r'/[a-z]{2}_([a-z]{2})_', cc_url, 'original lang', + default=None) + sub_dict = (automatic_captions + if 'auto-translated' in cc_label or original_lang + else subtitles) sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({ 'url': cc_url, }) From 8fe104947d6b772ccacd0b8af51d1adb16865e99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 15 Dec 2018 
22:25:12 +0700 Subject: [PATCH 138/159] [youtube] Fix multifeed extraction (closes #18531) --- youtube_dl/extractor/youtube.py | 52 ++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3f49f3889..c582ab2ff 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1712,30 +1712,36 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_description = '' - if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False): + if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): - entries = [] - feed_ids = [] - multifeed_metadata_list = video_info['multifeed_metadata_list'][0] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/rg3/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), - }) - feed_ids.append(feed_data['id'][0]) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + multifeed_metadata_list = try_get( + player_response, + lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], + compat_str) or try_get( + video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) + if multifeed_metadata_list: + entries = [] + feed_ids = [] + for feed in 
multifeed_metadata_list.split(','): + # Unquote should take place before split on comma (,) since textual + # fields may contain comma as well (see + # https://github.com/rg3/youtube-dl/issues/8536) + feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + entries.append({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + 'url': smuggle_url( + '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), + {'force_singlefeed': True}), + 'title': '%s (%s)' % (video_title, feed_data['title'][0]), + }) + feed_ids.append(feed_data['id'][0]) + self.to_screen( + 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' + % (', '.join(feed_ids), video_id)) + return self.playlist_result(entries, video_id, video_title, video_description) + else: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) if view_count is None: view_count = extract_view_count(video_info) From 7f41a598b3fba1bcab2817de64a08941200aa3c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 15 Dec 2018 23:08:14 +0700 Subject: [PATCH 139/159] [safari] Add support for learning.oreilly.com (closes #18510) --- youtube_dl/extractor/safari.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 30e2a38b4..c0d32a1b9 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -15,10 +15,10 @@ from ..utils import ( class SafariBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' + _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/' _NETRC_MACHINE = 'safari' - _API_BASE = 'https://www.safaribooksonline.com/api/v1' + _API_BASE = 'https://learning.oreilly.com/api/v1' _API_FORMAT = 'json' LOGGED_IN = False @@ -76,7 +76,7 @@ class SafariIE(SafariBaseIE): IE_DESC = 'safaribooksonline.com online video' _VALID_URL = r'''(?x) https?:// - 
(?:www\.)?safaribooksonline\.com/ + (?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/ (?: library/view/[^/]+/(?P[^/]+)/(?P[^/?\#&]+)\.html| videos/[^/]+/[^/]+/(?P[^-]+-[^/?\#&]+) @@ -104,6 +104,9 @@ class SafariIE(SafariBaseIE): }, { 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro', + 'only_matching': True, }] _PARTNER_ID = '1926081' @@ -160,7 +163,7 @@ class SafariIE(SafariBaseIE): class SafariApiIE(SafariBaseIE): IE_NAME = 'safari:api' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P[^/]+)/chapter(?:-content)?/(?P[^/?#&]+)\.html' + _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/api/v1/book/(?P[^/]+)/chapter(?:-content)?/(?P[^/?#&]+)\.html' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', @@ -185,7 +188,7 @@ class SafariCourseIE(SafariBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?safaribooksonline\.com/ + (?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/ (?: library/view/[^/]+| api/v1/book| @@ -213,6 +216,9 @@ class SafariCourseIE(SafariBaseIE): }, { 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838', + 'only_matching': True, }] @classmethod From c984196cf1c9eb34725091e07a4f094b7eea1d4f Mon Sep 17 00:00:00 2001 From: Sergey M Date: Sat, 15 Dec 2018 23:59:17 +0700 Subject: [PATCH 140/159] [README.md] Bind info dict URLs to a fixed blob (closes #18492) --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 35c3de512..a1d2904c0 100644 --- a/README.md +++ b/README.md @@ -1024,7 +1024,7 @@ After you 
have ensured this site is distributing its content legally, you can fo ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. -7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. 8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. 9. 
When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: @@ -1045,7 +1045,7 @@ Extractors are very fragile by nature since they depend on the layout of the sou ### Mandatory and optional metafields -For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: +For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: - `id` (media identifier) - `title` (media title) @@ -1053,7 +1053,7 @@ For extraction to work youtube-dl relies on metadata your extractor extracts and In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. -[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. 
That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. +[Any field](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L188-L303) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example From 21c340b83fb41094ef59b87d52c4eb1d90d1df04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 Dec 2018 19:35:48 +0700 Subject: [PATCH 141/159] [youtube] Fix mark watched (closes #18546) --- youtube_dl/extractor/youtube.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c582ab2ff..44c25c11c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -48,6 +48,7 @@ from ..utils import ( unified_strdate, unsmuggle_url, uppercase_escape, + url_or_none, urlencode_postdata, ) @@ -1386,8 +1387,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.report_warning(err_msg) return {} - def _mark_watched(self, video_id, video_info): - playback_url = video_info.get('videostats_playback_base_url', [None])[0] + def _mark_watched(self, video_id, video_info, player_response): + playback_url = url_or_none(try_get( + player_response, + lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( + video_info, lambda x: x['videostats_playback_base_url'][0])) if not playback_url: return parsed_playback_url = compat_urlparse.urlparse(playback_url) @@ -2122,7 
+2126,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._sort_formats(formats) - self.mark_watched(video_id, video_info) + self.mark_watched(video_id, video_info, player_response) return { 'id': video_id, From c8b37510868b20b0829583557899d70db2ea6243 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 16 Dec 2018 14:28:40 +0100 Subject: [PATCH 142/159] [vrv] fix initial state extraction --- youtube_dl/extractor/vrv.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index ac0819c7c..483a3be3a 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -120,8 +120,10 @@ class VRVIE(VRVBaseIE): url, video_id, headers=self.geo_verification_headers()) media_resource = self._parse_json(self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*({.+?})', - webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {} + [ + r'window\.__INITIAL_STATE__\s*=\s*({.+?})(?:|;)', + r'window\.__INITIAL_STATE__\s*=\s*({.+})' + ], webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {} video_data = media_resource.get('json') if not video_data: From 90046d77616f8f24e3716c64443118a9dcbec996 Mon Sep 17 00:00:00 2001 From: yonaikerlol Date: Sun, 16 Dec 2018 17:10:36 -0400 Subject: [PATCH 143/159] [iprima] Relax _VALID_URL (closes #18515) --- youtube_dl/extractor/iprima.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 11a6629d2..11bbeb592 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -12,7 +12,7 @@ from ..utils import ( class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://(?:play|prima|www)\.iprima\.cz/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P[^/?#&]+)' _GEO_BYPASS = False _TESTS = [{ @@ -44,6 +44,21 @@ class IPrimaIE(InfoExtractor): }, { 'url': 
'http://www.iprima.cz/filmy/desne-rande', 'only_matching': True, + }, { + 'url': 'https://zoom.iprima.cz/10-nejvetsich-tajemstvi-zahad/posvatna-mista-a-stavby', + 'only_matching': True, + }, { + 'url': 'https://krimi.iprima.cz/mraz-0/sebevrazdy', + 'only_matching': True, + }, { + 'url': 'https://cool.iprima.cz/derava-silnice-nevadi', + 'only_matching': True, + }, { + 'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi', + 'only_matching': True, + }, { + 'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1', + 'only_matching': True, }] def _real_extract(self, url): From 252e172dea96b90191682afac837535de4d33107 Mon Sep 17 00:00:00 2001 From: Tim Landscheidt Date: Sun, 16 Dec 2018 21:29:12 +0000 Subject: [PATCH 144/159] [acast] Add support for embed.acast.com --- youtube_dl/extractor/acast.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 6d846ea7a..1fbff705d 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -17,25 +17,8 @@ from ..utils import ( class ACastIE(InfoExtractor): IE_NAME = 'acast' - _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P[^/]+)/(?P[^/#?]+)' + _VALID_URL = r'https?://(?:(?:embed|www)\.)?acast\.com/(?P[^/]+)/(?P[^/#?]+)' _TESTS = [{ - # test with one bling - 'url': 'https://www.acast.com/condenasttraveler/-where-are-you-taipei-101-taiwan', - 'md5': 'ada3de5a1e3a2a381327d749854788bb', - 'info_dict': { - 'id': '57de3baa-4bb0-487e-9418-2692c1277a34', - 'ext': 'mp3', - 'title': '"Where Are You?": Taipei 101, Taiwan', - 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', - 'timestamp': 1196172000, - 'upload_date': '20071127', - 'duration': 211, - 'creator': 'Concierge', - 'series': 'Condé Nast Traveler Podcast', - 'episode': '"Where Are You?": Taipei 101, Taiwan', - } - }, { - # test with multiple blings 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', 'md5': 
'a02393c74f3bdb1801c3ec2695577ce0', 'info_dict': { @@ -50,6 +33,9 @@ class ACastIE(InfoExtractor): 'series': 'Spår', 'episode': '2. Raggarmordet - Röster ur det förflutna', } + }, { + 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', + 'only_matching': True, }] def _real_extract(self, url): From 50a498a68e2f6754f5b26e5ad1f6eabfd9aeeb14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Dec 2018 04:32:59 +0700 Subject: [PATCH 145/159] [acast] Extend _VALID_URL --- youtube_dl/extractor/acast.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 1fbff705d..b32f74a37 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -17,7 +17,14 @@ from ..utils import ( class ACastIE(InfoExtractor): IE_NAME = 'acast' - _VALID_URL = r'https?://(?:(?:embed|www)\.)?acast\.com/(?P[^/]+)/(?P[^/#?]+)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:embed|www)\.)?acast\.com/| + play\.acast\.com/s/ + ) + (?P[^/]+)/(?P[^/#?]+) + ''' _TESTS = [{ 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', 'md5': 'a02393c74f3bdb1801c3ec2695577ce0', @@ -36,6 +43,9 @@ class ACastIE(InfoExtractor): }, { 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', 'only_matching': True, + }, { + 'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22', + 'only_matching': True, }] def _real_extract(self, url): From 1c82122741783adf6653df25fa81ef0f95a22279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Dec 2018 04:51:57 +0700 Subject: [PATCH 146/159] [ard:beta] Relax _VALID_URL (closes #18441) --- youtube_dl/extractor/ard.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 2e1536a1b..a5df7f0f9 100644 --- 
a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -56,6 +56,10 @@ class ARDMediathekIE(InfoExtractor): 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) + def _extract_media_info(self, media_info_url, webpage, video_id): media_info = self._download_json( media_info_url, video_id, 'Downloading media JSON') @@ -296,7 +300,7 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(InfoExtractor): - _VALID_URL = r'https://beta\.ardmediathek\.de/[a-z]+/player/(?P<video_id>[a-zA-Z0-9]+)/(?P<display_id>[^/?#]+)' + _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P<video_id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^/?#]+))?' _TESTS = [{ 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', 'md5': '2d02d996156ea3c397cfc5036b5d7f8f', @@ -310,12 +314,18 @@ class ARDBetaMediathekIE(InfoExtractor): 'upload_date': '20180826', 'ext': 'mp4', }, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id webpage = self._download_webpage(url, display_id) data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') From 752582183a1942b12880139039137b3a60962611 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Dec 2018 05:29:59 +0700 Subject: [PATCH 147/159] [ard:beta] Improve extraction robustness, fix subtitles extraction, improve geo restricted videos extraction --- youtube_dl/extractor/ard.py | 58 +++++++++++++++++++++++++------------ 1 file changed, 40
insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index a5df7f0f9..8adae4644 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,13 +8,16 @@ from .generic import GenericIE from ..utils import ( determine_ext, ExtractorError, - qualities, int_or_none, parse_duration, + qualities, + str_or_none, + try_get, unified_strdate, - xpath_text, + unified_timestamp, update_url_query, url_or_none, + xpath_text, ) from ..compat import compat_etree_fromstring @@ -336,43 +339,62 @@ class ARDBetaMediathekIE(InfoExtractor): 'display_id': display_id, } formats = [] + subtitles = {} + geoblocked = False for widget in data.values(): - if widget.get('_geoblocked'): - raise ExtractorError('This video is not available due to geoblocking', expected=True) - + if widget.get('_geoblocked') is True: + geoblocked = True if '_duration' in widget: - res['duration'] = widget['_duration'] + res['duration'] = int_or_none(widget['_duration']) if 'clipTitle' in widget: res['title'] = widget['clipTitle'] if '_previewImage' in widget: res['thumbnail'] = widget['_previewImage'] if 'broadcastedOn' in widget: - res['upload_date'] = unified_strdate(widget['broadcastedOn']) + res['timestamp'] = unified_timestamp(widget['broadcastedOn']) if 'synopsis' in widget: res['description'] = widget['synopsis'] - if '_subtitleUrl' in widget: - res['subtitles'] = {'de': [{ + subtitle_url = url_or_none(widget.get('_subtitleUrl')) + if subtitle_url: + subtitles.setdefault('de', []).append({ 'ext': 'ttml', - 'url': widget['_subtitleUrl'], - }]} + 'url': subtitle_url, + }) if '_quality' in widget: - format_url = widget['_stream']['json'][0] - - if format_url.endswith('.f4m'): + format_url = url_or_none(try_get( + widget, lambda x: x['_stream']['json'][0])) + if not format_url: + continue + ext = determine_ext(format_url) + if ext == 'f4m': formats.extend(self._extract_f4m_formats( format_url + '?hdcore=3.11.0', video_id, f4m_id='hds', 
fatal=False)) - elif format_url.endswith('m3u8'): + elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + format_url, video_id, 'mp4', m3u8_id='hls', + fatal=False)) else: + # HTTP formats are not available when geoblocked is True, + # other formats are fine though + if geoblocked: + continue + quality = str_or_none(widget.get('_quality')) formats.append({ - 'format_id': 'http-' + widget['_quality'], + 'format_id': ('http-' + quality) if quality else 'http', 'url': format_url, 'preference': 10, # Plain HTTP, that's nice }) + if not formats and geoblocked: + self.raise_geo_restricted( + msg='This video is not available due to geoblocking', + countries=['DE']) + self._sort_formats(formats) - res['formats'] = formats + res.update({ + 'subtitles': subtitles, + 'formats': formats, + }) return res From cbb3e4b14f59fe2dba5983f7e55308eba2ea025b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Dec 2018 05:34:55 +0700 Subject: [PATCH 148/159] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index 9aef67b52..291ae42bb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version + +Extractors +* [ard:beta] Improve geo restricted videos extraction +* [ard:beta] Fix subtitles extraction +* [ard:beta] Improve extraction robustness +* [ard:beta] Relax URL regular expression (#18441) +* [acast] Add support for embed.acast.com and play.acast.com (#18483) +* [iprima] Relax URL regular expression (#18515, #18540) +* [vrv] Fix initial state extraction (#18553) +* [youtube] Fix mark watched (#18546) ++ [safari] Add support for learning.oreilly.com (#18510) +* [youtube] Fix multifeed extraction (#18531) +* [lecturio] Improve subtitles extraction (#18488) +* [uol] Fix format URL extraction (#18480) ++ [ard:mediathek] Add support for classic.ardmediathek.de (#18473) + + version 2018.12.09 Core From 
4cee62ade0d991eedb2feae927c44370be3c389e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Dec 2018 05:37:50 +0700 Subject: [PATCH 149/159] release 2018.12.17 --- .github/ISSUE_TEMPLATE.md | 6 +++--- CONTRIBUTING.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 1ccf410a7..b84559932 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.09** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.17*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.17** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.12.09 +[debug] youtube-dl version 2018.12.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bbcb78808..cba87190d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -152,7 +152,7 @@ After you have ensured this site is distributing its content legally, you can fo ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. -7. 
Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. 8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. 9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: @@ -173,7 +173,7 @@ Extractors are very fragile by nature since they depend on the layout of the sou ### Mandatory and optional metafields -For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. 
Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: +For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: - `id` (media identifier) - `title` (media title) @@ -181,7 +181,7 @@ For extraction to work youtube-dl relies on metadata your extractor extracts and In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. -[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. +[Any field](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L188-L303) apart from the aforementioned ones are considered **optional**. 
That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example diff --git a/ChangeLog b/ChangeLog index 291ae42bb..510013e89 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.12.17 Extractors * [ard:beta] Improve geo restricted videos extraction diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d15b9583f..0461aac0c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.12.09' +__version__ = '2018.12.17' From 7216e9bff71b4c537bb8d56386d789bb83f921f9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 17 Dec 2018 16:34:51 +0100 Subject: [PATCH 150/159] [discovery] Add support for Scripps Networks watch domains(closes #17947) --- youtube_dl/extractor/discovery.py | 35 +++++++++++++++++-------- youtube_dl/extractor/scrippsnetworks.py | 29 +++++--------------- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 3589bd428..44fbc41bb 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -17,16 +17,29 @@ from ..compat import compat_HTTPError class DiscoveryIE(DiscoveryGoBaseIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?P - discovery| - investigationdiscovery| - discoverylife| - animalplanet| - ahctv| - destinationamerica| - sciencechannel| - tlc| - velocity + _VALID_URL = r'''(?x)https?:// + (?P + (?:www\.)? + (?: + discovery| + investigationdiscovery| + discoverylife| + animalplanet| + ahctv| + destinationamerica| + sciencechannel| + tlc| + velocity + )| + watch\. 
+ (?: + hgtv| + foodnetwork| + travelchannel| + diynetwork| + cookingchanneltv| + motortrend + ) )\.com(?P/tv-shows/[^/]+/(?:video|full-episode)s/(?P[^./?#]+))''' _TESTS = [{ 'url': 'https://www.discovery.com/tv-shows/cash-cab/videos/dave-foley', @@ -71,7 +84,7 @@ class DiscoveryIE(DiscoveryGoBaseIE): if not access_token: access_token = self._download_json( - 'https://www.%s.com/anonymous' % site, display_id, query={ + 'https://%s.com/anonymous' % site, display_id, query={ 'authRel': 'authorization', 'client_id': try_get( react_data, lambda x: x['application']['apiClientId'], diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py index 4023aeef8..8b3275735 100644 --- a/youtube_dl/extractor/scrippsnetworks.py +++ b/youtube_dl/extractor/scrippsnetworks.py @@ -19,7 +19,7 @@ class ScrippsNetworksWatchIE(AWSIE): _VALID_URL = r'''(?x) https?:// watch\. - (?Phgtv|foodnetwork|travelchannel|diynetwork|cookingchanneltv|geniuskitchen)\.com/ + (?Pgeniuskitchen)\.com/ (?: player\.[A-Z0-9]+\.html\#| show/(?:[^/]+/){2}| @@ -28,38 +28,23 @@ class ScrippsNetworksWatchIE(AWSIE): (?P\d+) ''' _TESTS = [{ - 'url': 'http://watch.hgtv.com/show/HGTVE/Best-Ever-Treehouses/2241515/Best-Ever-Treehouses/', - 'md5': '26545fd676d939954c6808274bdb905a', + 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', 'info_dict': { - 'id': '4173834', + 'id': '4194875', 'ext': 'mp4', - 'title': 'Best Ever Treehouses', - 'description': "We're searching for the most over the top treehouses.", + 'title': 'Ample Hills Ice Cream Bike', + 'description': 'Courtney Rada churns up a signature GK Now ice cream with The Scoopmaster.', 'uploader': 'ANV', - 'upload_date': '20170922', - 'timestamp': 1506056400, + 'upload_date': '20171011', + 'timestamp': 1507698000, }, 'params': { 'skip_download': True, }, 'add_ie': [AnvatoIE.ie_key()], - }, { - 'url': 'http://watch.diynetwork.com/show/DSAL/Salvage-Dawgs/2656646/Covington-Church/', - 'only_matching': 
True, - }, { - 'url': 'http://watch.diynetwork.com/player.HNT.html#2656646', - 'only_matching': True, - }, { - 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', - 'only_matching': True, }] _SNI_TABLE = { - 'hgtv': 'hgtv', - 'diynetwork': 'diy', - 'foodnetwork': 'food', - 'cookingchanneltv': 'cook', - 'travelchannel': 'trav', 'geniuskitchen': 'genius', } From 386d1fea79e861b11c77a7353c07eb162fbb4dae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Dec 2018 23:43:12 +0700 Subject: [PATCH 151/159] [lecturio] Add support for lecturio.de (closes #18562) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/lecturio.py | 45 +++++++++++++++++++++++++++--- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6a5d12ab1..d72f52e36 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -557,6 +557,7 @@ from .lecture2go import Lecture2GoIE from .lecturio import ( LecturioIE, LecturioCourseIE, + LecturioDeCourseIE, ) from .leeco import ( LeIE, diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py index 0f1265cdf..24f78d928 100644 --- a/youtube_dl/extractor/lecturio.py +++ b/youtube_dl/extractor/lecturio.py @@ -64,8 +64,14 @@ class LecturioBaseIE(InfoExtractor): class LecturioIE(LecturioBaseIE): - _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P[^/?#&]+)\.lecture' - _TEST = { + _VALID_URL = r'''(?x) + https:// + (?: + app\.lecturio\.com/[^/]+/(?P[^/?#&]+)\.lecture| + (?:www\.)?lecturio\.de/[^/]+/(?P[^/?#&]+)\.vortrag + ) + ''' + _TESTS = [{ 'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos', 'md5': 'f576a797a5b7a5e4e4bbdfc25a6a6870', 'info_dict': { @@ -74,7 +80,10 @@ class LecturioIE(LecturioBaseIE): 'title': 'Important Concepts and Terms – Introduction to Microbiology', }, 
'skip': 'Requires lecturio account credentials', - } + }, { + 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag', + 'only_matching': True, + }] _CC_LANGS = { 'German': 'de', @@ -86,7 +95,8 @@ class LecturioIE(LecturioBaseIE): } def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') or mobj.group('id_de') webpage = self._download_webpage( 'https://app.lecturio.com/en/lecture/%s/player.html' % display_id, @@ -190,3 +200,30 @@ class LecturioCourseIE(LecturioBaseIE): 'title', default=None) return self.playlist_result(entries, display_id, title) + + +class LecturioDeCourseIE(LecturioBaseIE): + _VALID_URL = r'https://(?:www\.)?lecturio\.de/[^/]+/(?P[^/?#&]+)\.kurs' + _TEST = { + 'url': 'https://www.lecturio.de/jura/grundrechte.kurs', + 'only_matching': True, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'(?s)]+\bdata-lecture-id=["\'](?P\d+).+?\bhref=(["\'])(?P(?:(?!\2).)+\.vortrag)\b[^>]+>', + webpage): + lecture_url = urljoin(url, mobj.group('url')) + lecture_id = mobj.group('id') + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + + title = self._search_regex( + r']*>([^<]+)', webpage, 'title', default=None) + + return self.playlist_result(entries, display_id, title) From cfd13c4c45f63c408a3e53ac31561c286c9a3acc Mon Sep 17 00:00:00 2001 From: Daan van Vugt Date: Mon, 17 Dec 2018 18:03:00 +0100 Subject: [PATCH 152/159] [mediasite] Relax _VALID_URL --- youtube_dl/extractor/mediasite.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index 84876b883..de37df0c6 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -21,7 +21,7 @@ from ..utils import ( class 
MediasiteIE(InfoExtractor): - _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/Play/(?P[0-9a-f]{32,34})(?P\?[^#]+|)' + _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/livebroadcast/Presentation)/(?P[0-9a-f]{32,34})(?P\?[^#]+|)' _TESTS = [ { 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -84,7 +84,11 @@ class MediasiteIE(InfoExtractor): 'timestamp': 1333983600, 'duration': 7794, } - } + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d', + 'only_matching': True, + }, ] # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) From 4ee184545455d1ebf4a9f97f86ade2c4a3b3e03b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 18 Dec 2018 01:55:13 +0700 Subject: [PATCH 153/159] [mediasite] Extend _VALID_URL even more --- youtube_dl/extractor/mediasite.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index de37df0c6..ef9628e65 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -21,7 +21,7 @@ from ..utils import ( class MediasiteIE(InfoExtractor): - _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/livebroadcast/Presentation)/(?P[0-9a-f]{32,34})(?P\?[^#]+|)' + _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/(?:default|livebroadcast)/Presentation)/(?P[0-9a-f]{32,34})(?P\?[^#]+|)' _TESTS = [ { 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -89,6 +89,10 @@ class MediasiteIE(InfoExtractor): 'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d', 'only_matching': True, }, + { + 'url': 'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d', + 'only_matching': True, + }, ] # look in Mediasite.Core.js 
(Mediasite.ContentStreamType[*]) From 65e29cdac3f3ee14e07c200293438426b1135719 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 18 Dec 2018 22:46:19 +0100 Subject: [PATCH 154/159] [twitter] pass referer with card request(closes #18579) --- youtube_dl/extractor/twitter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index de41065d6..41d0b6be8 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -171,7 +171,8 @@ class TwitterCardIE(TwitterBaseIE): urls.append('https://twitter.com/i/videos/' + video_id) for u in urls: - webpage = self._download_webpage(u, video_id) + webpage = self._download_webpage( + u, video_id, headers={'Referer': 'https://twitter.com/'}) iframe_url = self._html_search_regex( r']+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', From 904bb599be0fcf5fa861f17eee757c3c9494208b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elan=20Ruusam=C3=A4e?= Date: Wed, 19 Dec 2018 22:22:10 +0200 Subject: [PATCH 155/159] [README.md] Add flake8 instructions --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a1d2904c0..b3c39bf66 100644 --- a/README.md +++ b/README.md @@ -1025,15 +1025,19 @@ After you have ensured this site is distributing its content legally, you can fo 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. 
Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](http://flake8.pycqa.org/en/latest/index.html#quickstart): + + $ flake8 youtube_dl/extractor/yourextractor.py + +9. Make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +10. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). 
We'll then review and merge it. In any case, thank you very much for your contributions! From 835e45abab88e2a1661b86be255e06323e88ec7a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 19 Dec 2018 22:07:37 +0100 Subject: [PATCH 156/159] [crackle] extract ism and http formats --- youtube_dl/extractor/crackle.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index 8dd9d6687..f73ef6b63 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -48,6 +48,21 @@ class CrackleIE(InfoExtractor): 'only_matching': True, }] + _MEDIA_FILE_SLOTS = { + '360p.mp4': { + 'width': 640, + 'height': 360, + }, + '480p.mp4': { + 'width': 768, + 'height': 432, + }, + '480p_1mbps.mp4': { + 'width': 852, + 'height': 480, + }, + } + def _real_extract(self, url): video_id = self._match_id(url) @@ -95,6 +110,20 @@ class CrackleIE(InfoExtractor): elif ext == 'mpd': formats.extend(self._extract_mpd_formats( format_url, video_id, mpd_id='dash', fatal=False)) + elif format_url.endswith('.ism/Manifest'): + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + else: + mfs_path = e.get('Type') + mfs_info = self._MEDIA_FILE_SLOTS.get(mfs_path) + if not mfs_info: + continue + formats.append({ + 'url': format_url, + 'format_id': 'http-' + mfs_path.split('.')[0], + 'width': mfs_info['width'], + 'height': mfs_info['height'], + }) self._sort_formats(formats) description = media.get('Description') From e1a0628797e2dc7e5e032f853ab4ddc6a7a08020 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 20 Dec 2018 23:22:51 +0100 Subject: [PATCH 157/159] [liveleak] add support for another embed type and restore original format extraction --- youtube_dl/extractor/liveleak.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 
26671753c..22a067e40 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -87,7 +87,7 @@ class LiveLeakIE(InfoExtractor): @staticmethod def _extract_urls(webpage): return re.findall( - r']+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[if]=[\w_]+[^"]+)"', + r']+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[ift]=[\w_]+[^"]+)"', webpage) def _real_extract(self, url): @@ -120,13 +120,27 @@ class LiveLeakIE(InfoExtractor): } for idx, info_dict in enumerate(entries): + formats = [] for a_format in info_dict['formats']: if not a_format.get('height'): a_format['height'] = int_or_none(self._search_regex( r'([0-9]+)p\.mp4', a_format['url'], 'height label', default=None)) + formats.append(a_format) - self._sort_formats(info_dict['formats']) + # Removing '.*.mp4' gives the raw video, which is essentially + # the same video without the LiveLeak logo at the top (see + # https://github.com/rg3/youtube-dl/pull/4768) + orig_url = re.sub(r'\.mp4\.[^.]+', '', a_format['url']) + if a_format['url'] != orig_url: + format_id = a_format.get('format_id') + formats.append({ + 'format_id': 'original' + ('-' + format_id if format_id else ''), + 'url': orig_url, + 'preference': 1, + }) + self._sort_formats(formats) + info_dict['formats'] = formats # Don't append entry ID for one-video pages to keep backward compatibility if len(entries) > 1: @@ -146,7 +160,7 @@ class LiveLeakIE(InfoExtractor): class LiveLeakEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[if])=(?P<id>[\w_]+)' + _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[ift])=(?P<id>[\w_]+)' # See generic.py for actual test cases _TESTS = [{ @@ -158,15 +172,14 @@ class LiveLeakEmbedIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - kind, video_id = mobj.group('kind', 'id') + kind, video_id = re.match(self._VALID_URL, url).groups() if kind == 'f': webpage = self._download_webpage(url,
video_id) liveleak_url = self._search_regex( - r'logourl\s*:\s*(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL, + r'(?:logourl\s*:\s*|window\.open\()(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL, webpage, 'LiveLeak URL', group='url') - elif kind == 'i': - liveleak_url = 'http://www.liveleak.com/view?i=%s' % video_id + else: + liveleak_url = 'http://www.liveleak.com/view?%s=%s' % (kind, video_id) return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key()) From 4273caf5c7e4a79783106938802c1f4e1aa9c950 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 23 Dec 2018 16:38:16 +0700 Subject: [PATCH 158/159] [youtube] Extend html5 player regex (closes #17516) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 44c25c11c..906774875 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1834,7 +1834,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: player_version = self._search_regex( [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'], + r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version From 63529e935cf5f87e6080607ef9d9196fe435e092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 23 Dec 2018 16:57:10 +0700 Subject: [PATCH 159/159] [youtube] Relax html5 player regexes (closes #18465, closes #18466) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 906774875..954853e00 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1105,7 +1105,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url,
example_sig): id_m = re.match( - r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P[a-z]+)$', + r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1834,7 +1834,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: player_version = self._search_regex( [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'], + r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version