From 29d9594561fd92b07d1c2cff04ae5a4c144946b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Mar 2018 22:11:01 +0700 Subject: [PATCH 001/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog b/ChangeLog index 0d748316e..d4f442421 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +version + +Core ++ [downloader/external] Add elapsed time to progress hook (#10876) +* [downloader/external,fragment] Fix download finalization when writing file + to stdout (#10809, #10876, #15799) + +Extractors +* [vrv] Fix extraction on python2 (#15928) +* [afreecatv] Update referrer (#15947) ++ [24video] Add support for 24video.sexy (#15973) +* [crackle] Bypass geo restriction +* [crackle] Fix extraction (#15969) ++ [lenta] Add support for lenta.ru (#15953) ++ [instagram:user] Add pagination (#15934) +* [youku] Update ccode (#15939) +* [libsyn] Adapt to new page structure + + version 2018.03.20 Core From 671e241bfbf5d1954ff07c98e0ba2c3d7c2405c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Mar 2018 05:03:47 +0700 Subject: [PATCH 002/488] release 2018.03.26 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 75c5b2226..86912f5e7 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.20*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.20** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.26** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.03.20 +[debug] youtube-dl version 2018.03.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index d4f442421..0d43b580f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.03.26 Core + [downloader/external] Add elapsed time to progress hook (#10876) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 80358bb14..0d7d7fbb3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -419,6 +419,7 @@ - **Lecture2Go** - **LEGO** - **Lemonde** + - **Lenta** - **LePlaylist** - **LetvCloud**: 乐视云 - **Libsyn** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c686714f0..d6d87ad74 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.03.20' +__version__ = '2018.03.26' From c3cfc71a0c822c86a01ad9c150415724d0b2b045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Mar 2018 22:30:11 +0700 Subject: [PATCH 003/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 0d43b580f..d4f442421 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version 2018.03.26 +version Core + [downloader/external] Add elapsed time to progress hook (#10876) From bbd9d8c17075055ddfd9873092a29a3e21566805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Mar 2018 22:32:03 +0700 Subject: [PATCH 004/488] release 2018.03.26.1 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 86912f5e7..0cd090e40 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.26** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.26.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.26.1** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.03.26 +[debug] youtube-dl version 2018.03.26.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index d4f442421..f9d04ffd9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.03.26.1 Core + [downloader/external] Add elapsed time to progress hook (#10876) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d6d87ad74..d38fde039 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.03.26' +__version__ = '2018.03.26.1' From 99c3091850118d08c14c78f5cc6ab5ce73f4196a Mon Sep 17 00:00:00 2001 From: Attila-Mihaly Balazs Date: Tue, 27 Mar 2018 18:02:04 +0300 Subject: [PATCH 005/488] [videa] Extend _VALID_URL --- youtube_dl/extractor/videa.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index 311df58f4..d0e34c819 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -16,7 +16,7 @@ from ..utils import ( class VideaIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// - videa\.hu/ + videa(?:kid)?\.hu/ (?: videok/(?:[^/]+/)*[^?#&]+-| player\?.*?\bv=| @@ -31,7 +31,7 @@ class VideaIE(InfoExtractor): 'id': '8YfIAjxwWGwT8HVQ', 'ext': 'mp4', 'title': 'Az őrült kígyász 285 kígyót enged szabadon', - 'thumbnail': 'http://videa.hu/static/still/1.4.1.1007274.1204470.3', + 'thumbnail': r're:^https?://.*', 'duration': 21, }, }, { @@ -43,6 +43,15 @@ class VideaIE(InfoExtractor): }, { 'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', 'only_matching': True, + }, { + 'url': 'https://videakid.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', + 'only_matching': True, + }, { + 'url': 'https://videakid.hu/player?v=8YfIAjxwWGwT8HVQ', + 'only_matching': True, + }, { + 'url': 'https://videakid.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', + 'only_matching': True, }] @staticmethod From 9e6a4180158026e78f65563d0586923fef8ccece Mon Sep 17 00:00:00 2001 From: xofe <22776566+xofe@users.noreply.github.com> Date: Tue, 27 Mar 2018 15:08:40 +0000 Subject: [PATCH 006/488] [abc:iview] Unescape title and series meta fields --- youtube_dl/extractor/abc.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 87017ed39..512f04684 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -13,6 +13,7 @@ from ..utils import ( int_or_none, parse_iso8601, try_get, + unescapeHTML, update_url_query, ) @@ -109,16 +110,17 @@ class ABCIViewIE(InfoExtractor): # ABC iview programs are normally available for 14 days only. _TESTS = [{ - 'url': 'http://iview.abc.net.au/programs/call-the-midwife/ZW0898A003S00', + 'url': 'https://iview.abc.net.au/programs/ben-and-hollys-little-kingdom/ZY9247A021S00', 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', 'info_dict': { - 'id': 'ZW0898A003S00', + 'id': 'ZY9247A021S00', 'ext': 'mp4', - 'title': 'Series 5 Ep 3', - 'description': 'md5:e0ef7d4f92055b86c4f33611f180ed79', - 'upload_date': '20171228', - 'uploader_id': 'abc1', - 'timestamp': 1514499187, + 'title': "Gaston's Visit", + 'series': "Ben And Holly's Little Kingdom", + 'description': 'md5:18db170ad71cf161e006a4c688e33155', + 'upload_date': '20180318', + 'uploader_id': 'abc4kids', + 'timestamp': 1521400959, }, 'params': { 'skip_download': True, @@ -169,12 +171,12 @@ class ABCIViewIE(InfoExtractor): return { 'id': video_id, - 'title': title, + 'title': unescapeHTML(title), 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), 'thumbnail': self._html_search_meta(['og:image', 'twitter:image:src'], webpage), 'duration': int_or_none(video_params.get('eventDuration')), 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), - 'series': video_params.get('seriesTitle'), + 'series': unescapeHTML(video_params.get('seriesTitle')), 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], 'episode_number': int_or_none(self._html_search_meta('episodeNumber', webpage, default=None)), 'episode': self._html_search_meta('episode_title', webpage, default=None), From 5d60b9971784289acd4325a8ed7b5afd7bea05ca Mon Sep 17 00:00:00 2001 From: "Arend v. Reinersdorff" Date: Tue, 27 Mar 2018 17:25:29 +0200 Subject: [PATCH 007/488] [options] Mention comments support in --batch-file --- youtube_dl/options.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 7d1bbc021..3e4ac03a2 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -676,7 +676,8 @@ def parseOpts(overrideArguments=None): filesystem.add_option( '-a', '--batch-file', dest='batchfile', metavar='FILE', - help='File containing URLs to download (\'-\' for stdin)') + help="File containing URLs to download ('-' for stdin), one URL per line. " + "Lines starting with '#', ';' or ']' are considered as comments and ignored.") filesystem.add_option( '--id', default=False, action='store_true', dest='useid', help='Use only video ID in file name') From 02f6ccbce3a50d8db3eac06a5820347cf674ca86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 29 Mar 2018 23:06:13 +0700 Subject: [PATCH 008/488] [dramafever] Partially switch to API v5 (closes #16026) --- youtube_dl/extractor/dramafever.py | 156 +++++++++++++++++++---------- 1 file changed, 102 insertions(+), 54 deletions(-) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 6b60e542b..c7a048f9d 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -3,25 +3,26 @@ from __future__ import unicode_literals import itertools -from .amp import AMPIE +from .common import InfoExtractor from ..compat import ( - compat_HTTPError, + compat_str, compat_urlparse, ) from ..utils import ( - ExtractorError, clean_html, + ExtractorError, int_or_none, - remove_end, + parse_age_limit, + parse_duration, sanitized_Request, + unified_timestamp, urlencode_postdata ) -class DramaFeverBaseIE(AMPIE): +class DramaFeverBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' - _GEO_COUNTRIES = ['US', 'CA'] _CONSUMER_SECRET = 'DA59dtVXYLxajktV' @@ -70,18 +71,20 @@ class DramaFeverIE(DramaFeverBaseIE): IE_NAME = 'dramafever' _VALID_URL = r'https?://(?:www\.)?dramafever\.com/(?:[^/]+/)?drama/(?P[0-9]+/[0-9]+)(?:/|$)' _TESTS = [{ - 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', + 'url': 'https://www.dramafever.com/drama/4274/1/Heirs/', 'info_dict': { - 'id': '4512.1', - 'ext': 'flv', - 'title': 'Cooking with Shin', - 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0', + 'id': '4274.1', + 'ext': 'wvm', + 'title': 'Heirs - Episode 1', + 'description': 'md5:362a24ba18209f6276e032a651c50bc2', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 3783, + 'timestamp': 1381354993, + 'upload_date': '20131009', + 'series': 'Heirs', + 'season_number': 1, 'episode': 'Episode 1', 'episode_number': 1, - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1404336058, - 'upload_date': '20140702', - 'duration': 344, }, 'params': { # m3u8 download @@ -110,50 +113,95 @@ class DramaFeverIE(DramaFeverBaseIE): 'only_matching': True, }] + def _call_api(self, path, video_id, note, fatal=False): + return self._download_json( + 'https://www.dramafever.com/api/5/' + path, + video_id, note=note, headers={ + 'x-consumer-key': self._consumer_secret, + }, fatal=fatal) + + def _get_subtitles(self, video_id): + subtitles = {} + subs = self._call_api( + 'video/%s/subtitles/webvtt/' % video_id, video_id, + 'Downloading subtitles JSON', fatal=False) + if not subs or not isinstance(subs, list): + return subtitles + for sub in subs: + if not isinstance(sub, dict): + continue + sub_url = sub.get('url') + if not sub_url or not isinstance(sub_url, compat_str): + continue + subtitles.setdefault( + sub.get('code') or sub.get('language') or 'en', []).append({ + 'url': sub_url + }) + return subtitles + def _real_extract(self, url): video_id = self._match_id(url).replace('/', '.') - try: - info = self._extract_feed_info( - 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - self.raise_geo_restricted( - msg='Currently unavailable in your country', - countries=self._GEO_COUNTRIES) - raise - - # title is postfixed with video id for some reason, removing - if info.get('title'): - info['title'] = remove_end(info['title'], video_id).strip() - series_id, episode_number = video_id.split('.') - episode_info = self._download_json( - # We only need a single episode info, so restricting page size to one episode - # and dealing with page number as with episode number - r'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_number=%s&page_size=1' - % (self._consumer_secret, series_id, episode_number), - video_id, 'Downloading episode info JSON', fatal=False) - if episode_info: - value = episode_info.get('value') - if isinstance(value, list): - for v in value: - if v.get('type') == 'Episode': - subfile = v.get('subfile') or v.get('new_subfile') - if subfile and subfile != 'http://www.dramafever.com/st/': - info.setdefault('subtitles', {}).setdefault('English', []).append({ - 'ext': 'srt', - 'url': subfile, - }) - episode_number = int_or_none(v.get('number')) - episode_fallback = 'Episode' - if episode_number: - episode_fallback += ' %d' % episode_number - info['episode'] = v.get('title') or episode_fallback - info['episode_number'] = episode_number - break - return info + video = self._call_api( + 'series/%s/episodes/%s/' % (series_id, episode_number), video_id, + 'Downloading video JSON') + + formats = [] + download_assets = video.get('download_assets') + if download_assets and isinstance(download_assets, dict): + for format_id, format_dict in download_assets.items(): + if not isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url or not isinstance(format_url, compat_str): + continue + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'filesize': int_or_none(video.get('filesize')), + }) + + stream = self._call_api( + 'video/%s/stream/' % video_id, video_id, 'Downloading stream JSON', + fatal=False) + if stream: + stream_url = stream.get('stream_url') + if stream_url: + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + title = video.get('title') or 'Episode %s' % episode_number + description = video.get('description') + thumbnail = video.get('thumbnail') + timestamp = unified_timestamp(video.get('release_date')) + duration = parse_duration(video.get('duration')) + age_limit = parse_age_limit(video.get('tv_rating')) + series = video.get('series_title') + season_number = int_or_none(video.get('season')) + + if series: + title = '%s - %s' % (series, title) + + subtitles = self.extract_subtitles(video_id) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'series': series, + 'season_number': season_number, + 'episode_number': int_or_none(episode_number), + 'formats': formats, + 'subtitles': subtitles, + } class DramaFeverSeriesIE(DramaFeverBaseIE): From 190f6c936be0ec03ed999cbf34e73f38c9beb022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 29 Mar 2018 23:49:09 +0700 Subject: [PATCH 009/488] [naver] Fix extraction (closes #16029) --- youtube_dl/extractor/naver.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 2047d4402..bb3d94413 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -43,9 +41,14 @@ class NaverIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m_id = re.search(r'var rmcPlayer = new nhn\.rmcnmv\.RMCVideoPlayer\("(.+?)", "(.+?)"', - webpage) - if m_id is None: + vid = self._search_regex( + r'videoId["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'video id', fatal=None, group='value') + in_key = self._search_regex( + r'inKey["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'key', default=None, group='value') + + if not vid or not in_key: error = self._html_search_regex( r'(?s)
\s*(?:)?\s*

(?P.+?)

\s*
', webpage, 'error', default=None) @@ -53,9 +56,9 @@ class NaverIE(InfoExtractor): raise ExtractorError(error, expected=True) raise ExtractorError('couldn\'t extract vid and key') video_data = self._download_json( - 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + m_id.group(1), + 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + vid, video_id, query={ - 'key': m_id.group(2), + 'key': in_key, }) meta = video_data['meta'] title = meta['subject'] From 3e78d23b5783d01f60bcb515febd5a590a734ee4 Mon Sep 17 00:00:00 2001 From: Luca Steeb Date: Fri, 30 Mar 2018 18:25:43 +0200 Subject: [PATCH 010/488] [openload] Add support for oload.site --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index eaaaf8a08..af7db6e12 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', From 0b4bbcdcb6f62e080e70c026eb28a5e92f46dfc8 Mon Sep 17 00:00:00 2001 From: kenavera Date: Sat, 31 Mar 2018 17:14:49 +0200 Subject: [PATCH 011/488] [medialaan] Fix vod id --- youtube_dl/extractor/medialaan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py index f8c30052f..50d5db802 100644 --- a/youtube_dl/extractor/medialaan.py +++ b/youtube_dl/extractor/medialaan.py @@ -141,6 +141,7 @@ class MedialaanIE(GigyaBaseIE): vod_id = config.get('vodId') or self._search_regex( (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"', + r'"vodId"\s*:\s*"(.+?)"', r'<[^>]+id=["\']vod-(\d+)'), webpage, 'video_id', default=None) From 0669f8fd8f19fbe0783974654fc2a6925d6162b0 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Sat, 31 Mar 2018 11:46:08 -0500 Subject: [PATCH 012/488] [xvideos] Fix thumbnail extraction (closes #15978) --- youtube_dl/extractor/xvideos.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 085c8d4f3..efee95651 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -58,7 +58,9 @@ class XVideosIE(InfoExtractor): group='title') or self._og_search_title(webpage) thumbnail = self._search_regex( - r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) + (r'setThumbUrl\(\s*(["\'])(?P(?:(?!\1).)+)\1', + r'url_bigthumb=(?P.+?)&'), + webpage, 'thumbnail', fatal=False, group='thumbnail') duration = int_or_none(self._og_search_property( 'duration', webpage, default=None)) or parse_duration( self._search_regex( From 95a1322bc10687efac0b00fb3fd55708e556baf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Apr 2018 02:06:14 +0700 Subject: [PATCH 013/488] [bilibili] Remove debug from player params regexes --- youtube_dl/extractor/bilibili.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 90697c4a7..3e3348ef5 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -117,9 +117,9 @@ class BiliBiliIE(InfoExtractor): r'cid(?:["\']:|=)(\d+)', webpage, 'cid', default=None ) or compat_parse_qs(self._search_regex( - [r'1EmbedPlayer\([^)]+,\s*"([^"]+)"\)', - r'1EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', - r'1]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], + [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', + r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', + r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], webpage, 'player parameters'))['cid'][0] else: if 'no_bangumi_tip' not in smuggled_data: From 03fcde10ced29291268f39cb8ccf7ee5dd40f676 Mon Sep 17 00:00:00 2001 From: kenavera Date: Sun, 1 Apr 2018 16:22:51 +0200 Subject: [PATCH 014/488] [nationalgeographic] Add support for new URL schema (closes #16001) --- youtube_dl/extractor/nationalgeographic.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 246f6795a..4d2ee6408 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -68,11 +68,11 @@ class NationalGeographicVideoIE(InfoExtractor): class NationalGeographicIE(ThePlatformIE, AdobePassIE): IE_NAME = 'natgeo' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:(?:wild/)?[^/]+/)?(?:videos|episodes)/(?P[^/?]+)' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:(?:(?:wild/)?[^/]+/)?(?:videos|episodes)|u)/(?P[^/?]+)' _TESTS = [ { - 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', + 'url': 'http://channel.nationalgeographic.com/u/kdi9Ld0PN2molUUIMSBGxoeDhD729KRjQcnxtetilWPMevo8ZwUBIDuPR0Q3D2LVaTsk0MPRkRWDB8ZhqWVeyoxfsZZm36yRp1j-zPfsHEyI_EgAeFY/', 'md5': '518c9aa655686cf81493af5cc21e2a04', 'info_dict': { 'id': 'vKInpacll2pC', @@ -86,7 +86,7 @@ class NationalGeographicIE(ThePlatformIE, AdobePassIE): 'add_ie': ['ThePlatform'], }, { - 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', + 'url': 'http://channel.nationalgeographic.com/u/kdvOstqYaBY-vSBPyYgAZRUL4sWUJ5XUUPEhc7ISyBHqoIO4_dzfY3K6EjHIC0hmFXoQ7Cpzm6RkET7S3oMlm6CFnrQwSUwo/', 'md5': 'c4912f656b4cbe58f3e000c489360989', 'info_dict': { 'id': 'Pok5lWCkiEFA', @@ -106,6 +106,14 @@ class NationalGeographicIE(ThePlatformIE, AdobePassIE): { 'url': 'http://channel.nationalgeographic.com/videos/treasures-rediscovered/', 'only_matching': True, + }, + { + 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', + 'only_matching': True, + }, + { + 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', + 'only_matching': True, } ] From e51762be19289da50977fd6f2d0ee2a1722765a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Apr 2018 22:47:39 +0700 Subject: [PATCH 015/488] [afreecatv] Add support for authentication (#14450) --- youtube_dl/extractor/afreecatv.py | 47 +++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 0f4535804..bb3728bb0 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + urlencode_postdata, xpath_text, ) @@ -28,6 +29,7 @@ class AfreecaTVIE(InfoExtractor): ) (?P\d+) ''' + _NETRC_MACHINE = 'afreecatv' _TESTS = [{ 'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=', 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', @@ -172,6 +174,51 @@ class AfreecaTVIE(InfoExtractor): video_key['part'] = int(m.group('part')) return video_key + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_form = { + 'szWork': 'login', + 'szType': 'json', + 'szUid': username, + 'szPassword': password, + 'isSaveId': 'false', + 'szScriptVar': 'oLoginRet', + 'szAction': '', + } + + response = self._download_json( + 'https://login.afreecatv.com/app/LoginAction.php', None, + 'Logging in', data=urlencode_postdata(login_form)) + + _ERRORS = { + -4: 'Your account has been suspended due to a violation of our terms and policies.', + -5: 'https://member.afreecatv.com/app/user_delete_progress.php', + -6: 'https://login.afreecatv.com/membership/changeMember.php', + -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.", + -9: 'https://member.afreecatv.com/app/pop_login_block.php', + -11: 'https://login.afreecatv.com/afreeca/second_login.php', + -12: 'https://member.afreecatv.com/app/user_security.php', + 0: 'The username does not exist or you have entered the wrong password.', + -1: 'The username does not exist or you have entered the wrong password.', + -3: 'You have entered your username/password incorrectly.', + -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.', + -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.', + -32008: 'You have failed to log in. Please contact our Help Center.', + } + + result = int_or_none(response.get('RESULT')) + if result != 1: + error = _ERRORS.get(result, 'You have failed to log in.') + raise ExtractorError( + 'Unable to login: %s said: %s' % (self.IE_NAME, error), + expected=True) + def _real_extract(self, url): video_id = self._match_id(url) From d563fb32ba5ef4b1a8061fca27edf3b1ad7eb8fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Apr 2018 23:07:54 +0700 Subject: [PATCH 016/488] [afreecatv] Remove debug output --- youtube_dl/extractor/afreecatv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index bb3728bb0..095e6204f 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -234,7 +234,7 @@ class AfreecaTVIE(InfoExtractor): r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs') video_id = self._search_regex( r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) - print(video_id, station_id, bbs_id) + video_xml = self._download_xml( 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', video_id, headers={ From 86693c4930b98e8df33736d87361400422b1adab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 Apr 2018 00:00:45 +0700 Subject: [PATCH 017/488] [afreecatv] Use partial view only when necessary (closes #14450) --- youtube_dl/extractor/afreecatv.py | 56 +++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 095e6204f..4b3d97136 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -141,22 +141,22 @@ class AfreecaTVIE(InfoExtractor): 'skip_download': True, }, }, { - # adult video - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/26542731', + # PARTIAL_ADULT + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439', 'info_dict': { - 'id': '20171001_F1AE1711_196617479_1', + 'id': '20180327_27901457_202289533_1', 'ext': 'mp4', - 'title': '[생]서아 초심 찾기 방송 (part 1)', + 'title': '[생]빨개요♥ (part 1)', 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'BJ서아', + 'uploader': '[SA]서아', 'uploader_id': 'bjdyrksu', - 'upload_date': '20171001', - 'duration': 3600, - 'age_limit': 18, + 'upload_date': '20180327', + 'duration': 3601, }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['adult content'], }, { 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', 'only_matching': True, @@ -235,21 +235,41 @@ class AfreecaTVIE(InfoExtractor): video_id = self._search_regex( r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) - video_xml = self._download_xml( - 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', - video_id, headers={ - 'Referer': url, - }, query={ + partial_view = False + for _ in range(2): + query = { 'nTitleNo': video_id, 'nStationNo': station_id, 'nBbsNo': bbs_id, - 'partialView': 'SKIP_ADULT', - }) + } + if partial_view: + query['partialView'] = 'SKIP_ADULT' + video_xml = self._download_xml( + 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', + video_id, 'Downloading video info XML%s' + % (' (skipping adult)' if partial_view else ''), + video_id, headers={ + 'Referer': url, + }, query=query) - flag = xpath_text(video_xml, './track/flag', 'flag', default=None) - if flag and flag != 'SUCCEED': + flag = xpath_text(video_xml, './track/flag', 'flag', default=None) + if flag and flag == 'SUCCEED': + break + if flag == 'PARTIAL_ADULT': + self._downloader.report_warning( + 'In accordance with local laws and regulations, underage users are restricted from watching adult content. ' + 'Only content suitable for all ages will be downloaded. ' + 'Provide account credentials if you wish to download restricted content.') + partial_view = True + continue + elif flag == 'ADULT': + error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.' + else: + error = flag raise ExtractorError( - '%s said: %s' % (self.IE_NAME, flag), expected=True) + '%s said: %s' % (self.IE_NAME, error), expected=True) + else: + raise ExtractorError('Unable to download video info') video_element = video_xml.findall(compat_xpath('./track/video'))[-1] if video_element is None or video_element.text is None: From 8bd1df3c316970f15662831c28311560884356a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 Apr 2018 22:19:42 +0700 Subject: [PATCH 018/488] [dramafever] Fix authentication (closes #16067) --- youtube_dl/extractor/dramafever.py | 41 ++++++++++++++++++------------ 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index c7a048f9d..ffbd2623d 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -2,9 +2,11 @@ from __future__ import unicode_literals import itertools +import json from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_str, compat_urlparse, ) @@ -14,14 +16,11 @@ from ..utils import ( int_or_none, parse_age_limit, parse_duration, - sanitized_Request, unified_timestamp, - urlencode_postdata ) class DramaFeverBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' _CONSUMER_SECRET = 'DA59dtVXYLxajktV' @@ -39,8 +38,8 @@ class DramaFeverBaseIE(InfoExtractor): 'consumer secret', default=self._CONSUMER_SECRET) def _real_initialize(self): - self._login() self._consumer_secret = self._get_consumer_secret() + self._login() def _login(self): (username, password) = self._get_login_info() @@ -52,19 +51,29 @@ class DramaFeverBaseIE(InfoExtractor): 'password': password, } - request = sanitized_Request( - self._LOGIN_URL, urlencode_postdata(login_form)) - response = self._download_webpage( - request, None, 'Logging in') + try: + response = self._download_json( + 'https://www.dramafever.com/api/users/login', None, 'Logging in', + data=json.dumps(login_form).encode('utf-8'), headers={ + 'x-consumer-key': self._consumer_secret, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (403, 404): + response = self._parse_json( + e.cause.read().decode('utf-8'), None) + else: + raise - if all(logout_pattern not in response - for logout_pattern in ['href="/accounts/logout/"', '>Log out<']): - error = self._html_search_regex( - r'(?s)]+\bclass="hidden-xs prompt"[^>]*>(.+?) Date: Mon, 12 Mar 2018 08:57:41 +0100 Subject: [PATCH 019/488] [tvnow] Add support for shows --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tvnow.py | 73 +++++++++++++++++++++++------- 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index de48a37ad..e3a67cc5b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1136,6 +1136,7 @@ from .tvnoe import TVNoeIE from .tvnow import ( TVNowIE, TVNowListIE, + TVNowListChannelIE, ) from .tvp import ( TVPEmbedIE, diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py index 1bf472444..8e0ac6be5 100644 --- a/youtube_dl/extractor/tvnow.py +++ b/youtube_dl/extractor/tvnow.py @@ -19,7 +19,7 @@ class TVNowBaseIE(InfoExtractor): 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', 'manifest.dashclear', 'format.title', 'format.defaultImage169Format', - 'format.defaultImage169Logo') + 'format.defaultImage169Logo', 'replaceMovieInformation') def _call_api(self, path, video_id, query): return self._download_json( @@ -58,7 +58,7 @@ class TVNowBaseIE(InfoExtractor): duration = parse_duration(info.get('duration')) f = info.get('format', {}) - thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + thumbnail = ('https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % info.get('replaceMovieInformation')) or f.get('defaultImage169Format') or f.get('defaultImage169Logo') return { 'id': video_id, @@ -133,7 +133,27 @@ class TVNowIE(TVNowBaseIE): return self._extract_video(info, display_id) -class TVNowListIE(TVNowBaseIE): +class TVNowListBaseIE(TVNowBaseIE): + def _extend_query(self, show, season, video=None): + fields = [] + fields.extend(show) + fields.extend('formatTabs.%s' % field for field in season) + if video: + fields.extend( + 'formatTabs.formatTabPages.container.movies.%s' % field + for field in video) + + return fields + + def _tvnow_list_info(self, list_id, show_id, fields): + return self._call_api( + 'formats/seo', list_id, query={ + 'fields': ','.join(fields), + 'name': show_id + '.php' + }) + + +class TVNowListIE(TVNowListBaseIE): _VALID_URL = r'(?Phttps?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P[^/]+)/)list/(?P[^?/#&]+)$' _SHOW_FIELDS = ('title', ) @@ -152,18 +172,7 @@ class TVNowListIE(TVNowBaseIE): def _real_extract(self, url): base_url, show_id, season_id = re.match(self._VALID_URL, url).groups() - fields = [] - fields.extend(self._SHOW_FIELDS) - fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) - fields.extend( - 'formatTabs.formatTabPages.container.movies.%s' % field - for field in self._VIDEO_FIELDS) - - list_info = self._call_api( - 'formats/seo', season_id, query={ - 'fields': ','.join(fields), - 'name': show_id + '.php' - }) + list_info = self._tvnow_list_info(season_id, show_id, self._extend_query(self._SHOW_FIELDS, self._SEASON_FIELDS, self._VIDEO_FIELDS)) season = next( season for season in list_info['formatTabs']['items'] @@ -177,8 +186,40 @@ class TVNowListIE(TVNowBaseIE): seo_url = info.get('seoUrl') if not seo_url: continue + entries.append(self.url_result( - base_url + seo_url + '/player', 'TVNow', info.get('id'))) + base_url + seo_url + '/player', 'TVNow', str(info.get('id', seo_url)))) return self.playlist_result( entries, compat_str(season.get('id') or season_id), title) + + +class TVNowListChannelIE(TVNowListBaseIE): + _VALID_URL = r'(?Phttps?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P[^/]+))' + + _SHOW_FIELDS = ('id', 'title', ) + _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) + + _TESTS = [{ + 'url': 'https://www.tvnow.at/vox/ab-ins-beet', + 'only_matching': 'True', + }] + + @classmethod + def suitable(cls, url): + return False if TVNowIE.suitable(url) or TVNowListIE.suitable(url) else super(TVNowListChannelIE, cls).suitable(url) + + def _real_extract(self, url): + base_url, show_id = re.match(self._VALID_URL, url).groups() + + list_info = self._tvnow_list_info(show_id, show_id, self._extend_query(self._SHOW_FIELDS, self._SEASON_FIELDS)) + + entries = [] + for season_info in list_info['formatTabs']['items']: + season_url = season_info.get('seoheadline') + if not season_url: + continue + entries.append(self.url_result( + base_url + "/list/" + season_url, 'TVNowList', compat_str(season_info.get('id')), season_info.get('headline'))) + + return self.playlist_result(entries) From ea6679fbeb1fb91131022886a0a8697e4c75f07f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 Apr 2018 00:08:22 +0700 Subject: [PATCH 020/488] [tvnow] Fix issues, simplify and improve (closes #15837) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/tvnow.py | 124 +++++++++++++++++++++-------- 2 files changed, 90 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e3a67cc5b..bded6e144 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1136,7 +1136,7 @@ from .tvnoe import TVNoeIE from .tvnow import ( TVNowIE, TVNowListIE, - TVNowListChannelIE, + TVNowShowIE, ) from .tvp import ( TVPEmbedIE, diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py index 8e0ac6be5..808571ece 100644 --- a/youtube_dl/extractor/tvnow.py +++ b/youtube_dl/extractor/tvnow.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, parse_iso8601, parse_duration, + try_get, update_url_query, ) @@ -19,7 +20,7 @@ class TVNowBaseIE(InfoExtractor): 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', 'manifest.dashclear', 'format.title', 'format.defaultImage169Format', - 'format.defaultImage169Logo', 'replaceMovieInformation') + 'format.defaultImage169Logo') def _call_api(self, path, video_id, query): return self._download_json( @@ -58,14 +59,22 @@ class TVNowBaseIE(InfoExtractor): duration = parse_duration(info.get('duration')) f = info.get('format', {}) - thumbnail = ('https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % info.get('replaceMovieInformation')) or f.get('defaultImage169Format') or f.get('defaultImage169Logo') + + thumbnails = [{ + 'url': 'https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % video_id, + }] + thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + if thumbnail: + thumbnails.append({ + 'url': thumbnail, + }) return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'timestamp': timestamp, 'duration': duration, 'series': f.get('title'), @@ -77,7 +86,12 @@ class TVNowBaseIE(InfoExtractor): class TVNowIE(TVNowBaseIE): - _VALID_URL = r'https?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P[^/]+)/(?:player|preview)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ + (?P[^/]+)/ + (?!(?:list|jahr)(?:/|$))(?P[^/?\#&]+) + ''' _TESTS = [{ 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player', @@ -99,27 +113,30 @@ class TVNowIE(TVNowBaseIE): }, { # rtl2 'url': 'https://www.tvnow.de/rtl2/armes-deutschland/episode-0008/player', - 'only_matching': 'True', + 'only_matching': True, }, { # rtlnitro 'url': 'https://www.tvnow.de/nitro/alarm-fuer-cobra-11-die-autobahnpolizei/auf-eigene-faust-pilot/player', - 'only_matching': 'True', + 'only_matching': True, }, { # superrtl 'url': 'https://www.tvnow.de/superrtl/die-lustigsten-schlamassel-der-welt/u-a-ketchup-effekt/player', - 'only_matching': 'True', + 'only_matching': True, }, { # ntv 'url': 'https://www.tvnow.de/ntv/startup-news/goetter-in-weiss/player', - 'only_matching': 'True', + 'only_matching': True, }, { # vox 'url': 'https://www.tvnow.de/vox/auto-mobil/neues-vom-automobilmarkt-2017-11-19-17-00-00/player', - 'only_matching': 'True', + 'only_matching': True, }, { # rtlplus 'url': 'https://www.tvnow.de/rtlplus/op-ruft-dr-bruckner/die-vernaehte-frau/player', - 'only_matching': 'True', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3', + 'only_matching': True, }] def _real_extract(self, url): @@ -134,27 +151,29 @@ class TVNowIE(TVNowBaseIE): class TVNowListBaseIE(TVNowBaseIE): - def _extend_query(self, show, season, video=None): - fields = [] - fields.extend(show) - fields.extend('formatTabs.%s' % field for field in season) - if video: - fields.extend( - 'formatTabs.formatTabPages.container.movies.%s' % field - for field in video) + _SHOW_VALID_URL = r'''(?x) + (?P + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ + (?P[^/]+) + ) + ''' - return fields - - def _tvnow_list_info(self, list_id, show_id, fields): + def _extract_list_info(self, display_id, show_id): + fields = list(self._SHOW_FIELDS) + fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) + fields.extend( + 'formatTabs.formatTabPages.container.movies.%s' % field + for field in self._VIDEO_FIELDS) return self._call_api( - 'formats/seo', list_id, query={ + 'formats/seo', display_id, query={ 'fields': ','.join(fields), 'name': show_id + '.php' }) class TVNowListIE(TVNowListBaseIE): - _VALID_URL = r'(?Phttps?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P[^/]+)/)list/(?P[^?/#&]+)$' + _VALID_URL = r'%s/(?:list|jahr)/(?P[^?\#&]+)' % TVNowListBaseIE._SHOW_VALID_URL _SHOW_FIELDS = ('title', ) _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) @@ -167,59 +186,94 @@ class TVNowListIE(TVNowListBaseIE): 'title': '30 Minuten Deutschland - Aktuell', }, 'playlist_mincount': 1, + }, { + 'url': 'https://www.tvnow.de/vox/ab-ins-beet/list/staffel-14', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/jahr/2018/3', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if TVNowIE.suitable(url) + else super(TVNowListIE, cls).suitable(url)) + def _real_extract(self, url): base_url, show_id, season_id = re.match(self._VALID_URL, url).groups() - list_info = self._tvnow_list_info(season_id, show_id, self._extend_query(self._SHOW_FIELDS, self._SEASON_FIELDS, self._VIDEO_FIELDS)) + list_info = self._extract_list_info(season_id, show_id) season = next( season for season in list_info['formatTabs']['items'] if season.get('seoheadline') == season_id) - title = '%s - %s' % (list_info['title'], season['headline']) + title = list_info.get('title') + headline = season.get('headline') + if title and headline: + title = '%s - %s' % (title, headline) + else: + title = headline or title entries = [] for container in season['formatTabPages']['items']: - for info in ((container.get('container') or {}).get('movies') or {}).get('items') or []: + items = try_get( + container, lambda x: x['container']['movies']['items'], + list) or [] + for info in items: seo_url = info.get('seoUrl') if not seo_url: continue - + video_id = info.get('id') entries.append(self.url_result( - base_url + seo_url + '/player', 'TVNow', str(info.get('id', seo_url)))) + '%s/%s/player' % (base_url, seo_url), TVNowIE.ie_key(), + compat_str(video_id) if video_id else None)) return self.playlist_result( entries, compat_str(season.get('id') or season_id), title) -class TVNowListChannelIE(TVNowListBaseIE): - _VALID_URL = r'(?Phttps?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P[^/]+))' +class TVNowShowIE(TVNowListBaseIE): + _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL _SHOW_FIELDS = ('id', 'title', ) _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) + _VIDEO_FIELDS = () _TESTS = [{ 'url': 'https://www.tvnow.at/vox/ab-ins-beet', - 'only_matching': 'True', + 'info_dict': { + 'id': 'ab-ins-beet', + 'title': 'Ab ins Beet!', + }, + 'playlist_mincount': 7, + }, { + 'url': 'https://www.tvnow.at/vox/ab-ins-beet/list', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/jahr/', + 'only_matching': True, }] @classmethod def suitable(cls, url): - return False if TVNowIE.suitable(url) or TVNowListIE.suitable(url) else super(TVNowListChannelIE, cls).suitable(url) + return (False if TVNowIE.suitable(url) or TVNowListIE.suitable(url) + else super(TVNowShowIE, cls).suitable(url)) def _real_extract(self, url): base_url, show_id = re.match(self._VALID_URL, url).groups() - list_info = self._tvnow_list_info(show_id, show_id, self._extend_query(self._SHOW_FIELDS, self._SEASON_FIELDS)) + list_info = self._extract_list_info(show_id, show_id) entries = [] for season_info in list_info['formatTabs']['items']: season_url = season_info.get('seoheadline') if not season_url: continue + season_id = season_info.get('id') entries.append(self.url_result( - base_url + "/list/" + season_url, 'TVNowList', compat_str(season_info.get('id')), season_info.get('headline'))) + '%s/list/%s' % (base_url, season_url), TVNowListIE.ie_key(), + compat_str(season_id) if season_id else None, + season_info.get('headline'))) - return self.playlist_result(entries) + return self.playlist_result(entries, show_id, list_info.get('title')) From 10f9caec048ca0c7c85a568d1dab12d7d7f7b45d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 Apr 2018 00:23:03 +0700 Subject: [PATCH 021/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index f9d04ffd9..89c58aba2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +version + +Extractors ++ [tvnow] Add support for shows (#15837) +* [dramafever] Fix authentication (#16067) +* [afreecatv] Use partial view only when necessary (#14450) ++ [afreecatv] Add support for authentication (#14450) ++ [nationalgeographic] Add support for new URL schema (#16001, #16054) +* [xvideos] Fix thumbnail extraction (#15978, #15979) +* [medialaan] Fix vod id (#16038) ++ [openload] Add support for oload.site (#16039) +* [naver] Fix extraction (#16029) +* [dramafever] Partially switch to API v5 (#16026) +* [abc:iview] Unescape title and series meta fields (#15994) +* [videa] Extend URL regular expression (#16003) + + version 2018.03.26.1 Core From e8dfecb3842ba54a4260af81e859e487e36eba41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 Apr 2018 00:26:11 +0700 Subject: [PATCH 022/488] release 2018.04.03 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 4 +++- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 0cd090e40..99e8acd33 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.26.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.26.1** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.03*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.03** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.03.26.1 +[debug] youtube-dl version 2018.04.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 89c58aba2..89dfbd8b8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.04.03 Extractors + [tvnow] Add support for shows (#15837) diff --git a/README.md b/README.md index 7dba5775d..5af0f387b 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo ## Filesystem Options: -a, --batch-file FILE File containing URLs to download ('-' for - stdin) + stdin), one URL per line. Lines starting + with '#', ';' or ']' are considered as + comments and ignored. --id Use only video ID in file name -o, --output TEMPLATE Output filename template, see the "OUTPUT TEMPLATE" for all the info diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0d7d7fbb3..17baac5ab 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -887,6 +887,7 @@ - **TVNoe** - **TVNow** - **TVNowList** + - **TVNowShow** - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska - **tvp:series** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d38fde039..a3163509c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.03.26.1' +__version__ = '2018.04.03' From fd97fa7bfc59983d315892c26f861842820a9579 Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Fri, 30 Mar 2018 20:02:09 +0200 Subject: [PATCH 023/488] [svtplay:series] Add extractor Related to #11130 --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/svt.py | 57 ++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index bded6e144..b46a304ac 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1031,6 +1031,7 @@ from .sunporno import SunPornoIE from .svt import ( SVTIE, SVTPlayIE, + SVTPlaylistIE, ) from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 48bc4529e..d02fd9450 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -9,6 +9,8 @@ from ..utils import ( dict_get, int_or_none, try_get, + urljoin, + compat_str, ) @@ -189,3 +191,58 @@ class SVTPlayIE(SVTBaseIE): r'\s*\|\s*.+?$', '', info_dict.get('episode') or self._og_search_title(webpage)) return info_dict + + +class SVTPlaylistIE(InfoExtractor): + IE_DESC = 'SVT Play serie' + _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P[^/?&#]+)' + IE_NAME = 'svtplay:serie' + _TESTS = [{ + 'url': 'https://www.svtplay.se/rederiet', + 'info_dict': { + 'id': 'rederiet', + 'title': 'Rederiet', + 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + }, + 'playlist_mincount': 318, + }] + + @classmethod + def suitable(cls, url): + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + + page = self._download_webpage( + url, video_id, + note='Downloading serie page', + errnote='unable to fetch serie page') + + root_json = self._search_regex( + r'root\[\'__svtplay\'\]\s*=(.+);\n', + page, 'root') + root = self._parse_json(root_json, video_id) + + metadata = root.get('metaData', {}) + related_videos_accordion = root['relatedVideoContent']['relatedVideosAccordion'] + + entries = [] + for season in related_videos_accordion: + videos = season.get('videos') + if not isinstance(videos, list): + continue + + for video in videos: + content_url = video.get('contentUrl') + if not isinstance(content_url, compat_str): + continue + entries.append( + self.url_result( + urljoin(url, content_url), + ie=SVTPlayIE.ie_key(), + video_title=video.get('title') + )) + + return self.playlist_result( + entries, video_id, metadata.get('title'), metadata.get('description')) From b71bb3ba8be711abab4c05527d28c4b5e4552401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 Apr 2018 23:52:00 +0700 Subject: [PATCH 024/488] [svtplay:series] Improve extraction (closes #16059) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/svt.py | 36 ++++++++++++++---------------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b46a304ac..c9f60114d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1031,7 +1031,7 @@ from .sunporno import SunPornoIE from .svt import ( SVTIE, SVTPlayIE, - SVTPlaylistIE, + SVTSeriesIE, ) from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index d02fd9450..45b4b8bf7 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -193,10 +193,8 @@ class SVTPlayIE(SVTBaseIE): return info_dict -class SVTPlaylistIE(InfoExtractor): - IE_DESC = 'SVT Play serie' +class SVTSeriesIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P[^/?&#]+)' - IE_NAME = 'svtplay:serie' _TESTS = [{ 'url': 'https://www.svtplay.se/rederiet', 'info_dict': { @@ -209,33 +207,28 @@ class SVTPlaylistIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPlaylistIE, cls).suitable(url) + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url) def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage( - url, video_id, - note='Downloading serie page', - errnote='unable to fetch serie page') + webpage = self._download_webpage( + url, video_id, 'Downloading serie page') - root_json = self._search_regex( - r'root\[\'__svtplay\'\]\s*=(.+);\n', - page, 'root') - root = self._parse_json(root_json, video_id) - - metadata = root.get('metaData', {}) - related_videos_accordion = root['relatedVideoContent']['relatedVideosAccordion'] + root = self._parse_json( + self._search_regex( + r'root\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P{.+?})\s*;\s*\n', + webpage, 'content', group='json'), + video_id) entries = [] - for season in related_videos_accordion: + for season in root['relatedVideoContent']['relatedVideosAccordion']: videos = season.get('videos') if not isinstance(videos, list): continue - for video in videos: content_url = video.get('contentUrl') - if not isinstance(content_url, compat_str): + if not content_url or not isinstance(content_url, compat_str): continue entries.append( self.url_result( @@ -244,5 +237,10 @@ class SVTPlaylistIE(InfoExtractor): video_title=video.get('title') )) + metadata = root.get('metaData') + if not isinstance(metadata, dict): + metadata = {} + return self.playlist_result( - entries, video_id, metadata.get('title'), metadata.get('description')) + entries, video_id, metadata.get('title'), + metadata.get('description')) From df146eb2827a97da507833c08169d84d708dfb02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 5 Apr 2018 00:05:09 +0700 Subject: [PATCH 025/488] [svtplay:series] Add support for season URLs --- youtube_dl/extractor/svt.py | 43 ++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 45b4b8bf7..d1d601b1f 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -4,6 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( determine_ext, dict_get, @@ -203,6 +207,14 @@ class SVTSeriesIE(InfoExtractor): 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', }, 'playlist_mincount': 318, + }, { + 'url': 'https://www.svtplay.se/rederiet?tab=sasong2', + 'info_dict': { + 'id': 'rederiet-sasong2', + 'title': 'Rederiet - Säsong 2', + 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + }, + 'playlist_count': 12, }] @classmethod @@ -210,19 +222,33 @@ class SVTSeriesIE(InfoExtractor): return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url) def _real_extract(self, url): - video_id = self._match_id(url) + series_id = self._match_id(url) + + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + season_slug = qs.get('tab', [None])[0] + + if season_slug: + series_id += '-%s' % season_slug webpage = self._download_webpage( - url, video_id, 'Downloading serie page') + url, series_id, 'Downloading series page') root = self._parse_json( self._search_regex( r'root\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P{.+?})\s*;\s*\n', webpage, 'content', group='json'), - video_id) + series_id) + + season_name = None entries = [] for season in root['relatedVideoContent']['relatedVideosAccordion']: + if not isinstance(season, dict): + continue + if season_slug: + if season.get('slug') != season_slug: + continue + season_name = season.get('name') videos = season.get('videos') if not isinstance(videos, list): continue @@ -241,6 +267,13 @@ class SVTSeriesIE(InfoExtractor): if not isinstance(metadata, dict): metadata = {} + title = metadata.get('title') + season_name = season_name or season_slug + + if title and season_name: + title = '%s - %s' % (title, season_name) + elif season_slug: + title = season_slug + return self.playlist_result( - entries, video_id, metadata.get('title'), - metadata.get('description')) + entries, series_id, title, metadata.get('description')) From 1236ac6b0bc5ef49e4065ddfc310d15651633093 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 5 Apr 2018 00:28:36 +0700 Subject: [PATCH 026/488] [svtplay] Share svtplay regex --- youtube_dl/extractor/svt.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index d1d601b1f..b544da414 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -128,7 +128,11 @@ class SVTIE(SVTBaseIE): return info_dict -class SVTPlayIE(SVTBaseIE): +class SVTPlayBaseIE(SVTBaseIE): + _SVTPLAY_RE = r'root\s*\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P{.+?})\s*;\s*\n' + + +class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P[0-9]+)' _TESTS = [{ @@ -163,8 +167,8 @@ class SVTPlayIE(SVTBaseIE): data = self._parse_json( self._search_regex( - r'root\["__svtplay"\]\s*=\s*([^;]+);', - webpage, 'embedded data', default='{}'), + self._SVTPLAY_RE, webpage, 'embedded data', default='{}', + group='json'), video_id, fatal=False) thumbnail = self._og_search_thumbnail(webpage) @@ -197,7 +201,7 @@ class SVTPlayIE(SVTBaseIE): return info_dict -class SVTSeriesIE(InfoExtractor): +class SVTSeriesIE(SVTPlayBaseIE): _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P[^/?&#]+)' _TESTS = [{ 'url': 'https://www.svtplay.se/rederiet', @@ -235,8 +239,7 @@ class SVTSeriesIE(InfoExtractor): root = self._parse_json( self._search_regex( - r'root\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P{.+?})\s*;\s*\n', - webpage, 'content', group='json'), + self._SVTPLAY_RE, webpage, 'content', group='json'), series_id) season_name = None From 235d828b7b113f22309a0f30048678baea210620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 5 Apr 2018 23:49:15 +0700 Subject: [PATCH 027/488] [openload] Fix extraction (closes #16099) --- youtube_dl/extractor/openload.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index af7db6e12..3e0a7a9a2 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -334,10 +334,11 @@ class OpenloadIE(InfoExtractor): decoded_id = (get_element_by_id('streamurl', webpage) or get_element_by_id('streamuri', webpage) or - get_element_by_id('streamurj', webpage)) - - if not decoded_id: - raise ExtractorError('Can\'t find stream URL', video_id=video_id) + get_element_by_id('streamurj', webpage) or + self._search_regex( + (r'>\s*([\da-zA-Z]+~\d{10,}~\d+\.\d+\.0\.0~[\da-zA-Z]+)\s*<', + r'>\s*([\w~]+~\d+\.\d+\.\d+\.\d+~[\w~]+)'), webpage, + 'stream URL')) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id From fdfb32a0dd80de4be67b0fcf93764bfa2a4ce7a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 6 Apr 2018 00:15:22 +0700 Subject: [PATCH 028/488] [openload] Relax stream URL regex --- youtube_dl/extractor/openload.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 3e0a7a9a2..9f5bebe40 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -336,8 +336,8 @@ class OpenloadIE(InfoExtractor): get_element_by_id('streamuri', webpage) or get_element_by_id('streamurj', webpage) or self._search_regex( - (r'>\s*([\da-zA-Z]+~\d{10,}~\d+\.\d+\.0\.0~[\da-zA-Z]+)\s*<', - r'>\s*([\w~]+~\d+\.\d+\.\d+\.\d+~[\w~]+)'), webpage, + (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<', + r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)'), webpage, 'stream URL')) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id From e944737c597a2f5e8e6ade93a25fc812119d4eb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 6 Apr 2018 23:40:15 +0700 Subject: [PATCH 029/488] [openload] Add support for oload.xyz --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 9f5bebe40..650f95656 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -298,6 +298,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.stream/f/KnG-kKZdcfY', 'only_matching': True, + }, { + 'url': 'https://oload.xyz/f/WwRBpzW8Wtk', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From e2750e1437497925c5a058947b850ddadd7ee7d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 7 Apr 2018 20:55:01 +0700 Subject: [PATCH 030/488] [liveleak] Extend _VALID_URL (closes #16117) --- youtube_dl/extractor/liveleak.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 246aac576..26671753c 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -7,7 +7,7 @@ from ..utils import int_or_none class LiveLeakIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P[\w_]+)(?:.*)' + _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?.*?\b[it]=(?P[\w_]+)' _TESTS = [{ 'url': 'http://www.liveleak.com/view?i=757_1364311680', 'md5': '0813c2430bea7a46bf13acf3406992f4', @@ -79,6 +79,9 @@ class LiveLeakIE(InfoExtractor): 'title': 'Fuel Depot in China Explosion caught on video', }, 'playlist_count': 3, + }, { + 'url': 'https://www.liveleak.com/view?t=HvHi_1523016227', + 'only_matching': True, }] @staticmethod From 9d15be3a5b1f0764d8493ccfc312fc0d0a2df164 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Sat, 7 Apr 2018 09:39:21 -0500 Subject: [PATCH 031/488] [drtuber] Fix title extraction (closes #16107) --- youtube_dl/extractor/drtuber.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index c88b3126b..5c41c8022 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -66,7 +66,9 @@ class DrTuberIE(InfoExtractor): self._sort_formats(formats) title = self._html_search_regex( - (r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<', + (r']+class=["\']title[^>]+>([^<]+)', + r'([^<]+)\s*@\s+DrTuber', + r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<', r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'), webpage, 'title') From ff826177cc154ba8c67b8162a25e067783dc4caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Apr 2018 23:57:32 +0700 Subject: [PATCH 032/488] [instagram:user] Fix extraction (closes #16119) --- youtube_dl/extractor/instagram.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index f9cd11b8e..9f570249f 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -243,6 +243,8 @@ class InstagramUserIE(InfoExtractor): return int_or_none(try_get( node, lambda x: x['edge_media_' + suffix]['count'])) + self._set_cookie('instagram.com', 'ig_pr', '1') + cursor = '' for page_num in itertools.count(1): media = self._download_json( From 1c9b1a449430bd8b267c9c43ce7ed7cb73ac4433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Apr 2018 00:08:45 +0700 Subject: [PATCH 033/488] [acast] Fix extraction (closes #16118) --- youtube_dl/extractor/acast.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 5871e72dc..4ad549c92 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -7,7 +7,7 @@ import functools from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - int_or_none, + float_or_none, unified_timestamp, OnDemandPagedList, ) @@ -46,18 +46,22 @@ class ACastIE(InfoExtractor): def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() + s = self._download_json( + 'https://play-api.acast.com/stitch/%s/%s' % (channel, display_id), + display_id)['result'] + media_url = s['url'] cast_data = self._download_json( 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), display_id) e = cast_data['result']['episode'] return { 'id': compat_str(e['id']), 'display_id': display_id, - 'url': e['mediaUrl'], + 'url': media_url, 'title': e['name'], 'description': e.get('description'), 'thumbnail': e.get('image'), 'timestamp': unified_timestamp(e.get('publishingDate')), - 'duration': int_or_none(e.get('duration')), + 'duration': float_or_none(s.get('duration') or e.get('duration')), } From cae5d9705c28ffc0bf5e149a5f92d31a48208e49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Apr 2018 00:21:55 +0700 Subject: [PATCH 034/488] [acast] Extract more metadata --- youtube_dl/extractor/acast.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 4ad549c92..6d846ea7a 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -8,6 +8,8 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( float_or_none, + int_or_none, + try_get, unified_timestamp, OnDemandPagedList, ) @@ -24,23 +26,29 @@ class ACastIE(InfoExtractor): 'id': '57de3baa-4bb0-487e-9418-2692c1277a34', 'ext': 'mp3', 'title': '"Where Are You?": Taipei 101, Taiwan', + 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', 'timestamp': 1196172000, 'upload_date': '20071127', - 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', 'duration': 211, + 'creator': 'Concierge', + 'series': 'Condé Nast Traveler Podcast', + 'episode': '"Where Are You?": Taipei 101, Taiwan', } }, { # test with multiple blings 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': 'e87d5b8516cd04c0d81b6ee1caca28d0', + 'md5': 'a02393c74f3bdb1801c3ec2695577ce0', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', 'title': '2. Raggarmordet - Röster ur det förflutna', + 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', 'timestamp': 1477346700, 'upload_date': '20161024', - 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', - 'duration': 2766, + 'duration': 2766.602563, + 'creator': 'Anton Berg & Martin Johnson', + 'series': 'Spår', + 'episode': '2. Raggarmordet - Röster ur det förflutna', } }] @@ -51,17 +59,25 @@ class ACastIE(InfoExtractor): display_id)['result'] media_url = s['url'] cast_data = self._download_json( - 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), display_id) - e = cast_data['result']['episode'] + 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), + display_id)['result'] + e = cast_data['episode'] + title = e['name'] return { 'id': compat_str(e['id']), 'display_id': display_id, 'url': media_url, - 'title': e['name'], - 'description': e.get('description'), + 'title': title, + 'description': e.get('description') or e.get('summary'), 'thumbnail': e.get('image'), 'timestamp': unified_timestamp(e.get('publishingDate')), 'duration': float_or_none(s.get('duration') or e.get('duration')), + 'filesize': int_or_none(e.get('contentLength')), + 'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str), + 'series': try_get(cast_data, lambda x: x['show']['name'], compat_str), + 'season_number': int_or_none(e.get('seasonNumber')), + 'episode': title, + 'episode_number': int_or_none(e.get('episodeNumber')), } From 717ea4e14e59bded0c2fb20e84b6513d82644b43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Apr 2018 00:29:43 +0700 Subject: [PATCH 035/488] [steam] Bypass mature content check (closes #16113) --- youtube_dl/extractor/steam.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py index e5ac586a7..a6a191ceb 100644 --- a/youtube_dl/extractor/steam.py +++ b/youtube_dl/extractor/steam.py @@ -75,6 +75,9 @@ class SteamIE(InfoExtractor): gameID = m.group('gameID') playlist_id = gameID videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id + + self._set_cookie('steampowered.com', 'mature_content', '1') + webpage = self._download_webpage(videourl, playlist_id) if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None: From 66b686727b198a6b14ddcbcfdcbaadd5b203362f Mon Sep 17 00:00:00 2001 From: aeph6Ee0 <aeph6Ee0@users.noreply.github.com> Date: Sat, 7 Apr 2018 22:09:42 +0200 Subject: [PATCH 036/488] [extractor/common] Relax JSON-LD context check (closes #16006) --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 890232586..59b9d3739 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1025,7 +1025,7 @@ class InfoExtractor(object): }) for e in json_ld: - if e.get('@context') == 'http://schema.org': + if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')): item_type = e.get('@type') if expected_type is not None and expected_type != item_type: return info From 608c738c7d8e6be21f0cc0bb7a844bad9d841964 Mon Sep 17 00:00:00 2001 From: GDR! <gdr@gdr.name> Date: Sun, 8 Apr 2018 17:13:00 +0200 Subject: [PATCH 037/488] [odnoklassniki] Extend _VALID_URL (closes #16081) --- youtube_dl/extractor/odnoklassniki.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 5c8b37e18..d87d0960f 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -19,7 +19,7 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer|live)/(?P<id>[\d-]+)' + _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?/|web-api/video/moviePlayer/|live/|dk\?.*?st\.mvId=)(?P<id>[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', @@ -101,6 +101,9 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'https://www.ok.ru/live/484531969818', 'only_matching': True, + }, { + 'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#', + 'only_matching': True, }] def _real_extract(self, url): From d04ca9761615e2ed3fdf89d8d87a4b9adfffacc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Apr 2018 22:21:21 +0700 Subject: [PATCH 038/488] [odnoklassniki] Improve _VALID_URL readability --- youtube_dl/extractor/odnoklassniki.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index d87d0960f..190d8af4d 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -19,7 +19,18 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?/|web-api/video/moviePlayer/|live/|dk\?.*?st\.mvId=)(?P<id>[\d-]+)' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m|mobile)\.)? + (?:odnoklassniki|ok)\.ru/ + (?: + video(?:embed)?/| + web-api/video/moviePlayer/| + live/| + dk\?.*?st\.mvId= + ) + (?P<id>[\d-]+) + ''' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', From 1fc37ca3f181159c98bccf081766abb73b9d344f Mon Sep 17 00:00:00 2001 From: Surya Oktafendri <f2face@f2face.com> Date: Mon, 9 Apr 2018 00:19:23 +0700 Subject: [PATCH 039/488] [generic] Add support for share-videos.se embeds (closes #16089) --- youtube_dl/extractor/generic.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index cf64398e3..4b210da72 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1967,6 +1967,16 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, + { + 'url': 'http://share-videos.se/auto/video/83645793?uid=13', + 'md5': 'b68d276de422ab07ee1d49388103f457', + 'info_dict': { + 'id': '83645793', + 'title': 'Lock up and get excited', + 'thumbnail': r're:^https?://.*\.jpg(\?.*)?$', + 'ext': 'mp4' + } } # { # # TODO: find another test @@ -2978,6 +2988,14 @@ class GenericIE(InfoExtractor): merged[k] = v return merged + # Look for Share-Videos.se embeds + sharevideosse_urls = [m.group('url') for m in re.finditer( + r'<iframe[^>]+?src\s*=\s*(["\'])(?P<url>https?://embed\.share-videos\.se/auto/embed/\d+.+?)\1', + webpage)] + if sharevideosse_urls: + return self.playlist_from_matches( + sharevideosse_urls, video_id, video_title) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: From d3431dcb90ea72fed502ecfd8f34e7499009a53a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Apr 2018 00:25:44 +0700 Subject: [PATCH 040/488] [generic] Restrict share-videos.se embeds regex to filter bogus URLs (#16115) --- youtube_dl/extractor/generic.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4b210da72..8922d1914 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1974,10 +1974,10 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': '83645793', 'title': 'Lock up and get excited', - 'thumbnail': r're:^https?://.*\.jpg(\?.*)?$', 'ext': 'mp4' - } - } + }, + 'skip': 'TODO: fix nested playlists processing in tests', + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2973,6 +2973,13 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( + r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', + webpage)] + if sharevideos_urls: + return self.playlist_from_matches( + sharevideos_urls, video_id, video_title) + def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): @@ -2988,14 +2995,6 @@ class GenericIE(InfoExtractor): merged[k] = v return merged - # Look for Share-Videos.se embeds - sharevideosse_urls = [m.group('url') for m in re.finditer( - r'<iframe[^>]+?src\s*=\s*(["\'])(?P<url>https?://embed\.share-videos\.se/auto/embed/\d+.+?)\1', - webpage)] - if sharevideosse_urls: - return self.playlist_from_matches( - sharevideosse_urls, video_id, video_title) - # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: From 069937151e429a2127569910d204c03eec167f0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Apr 2018 00:37:15 +0700 Subject: [PATCH 041/488] [generic] Add support for tube8 embeds --- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/tube8.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8922d1914..e3cb5c5ce 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -58,6 +58,7 @@ from .xhamster import XHamsterEmbedIE from .tnaflix import TNAFlixNetworkEmbedIE from .drtuber import DrTuberIE from .redtube import RedTubeIE +from .tube8 import Tube8IE from .vimeo import VimeoIE from .dailymotion import DailymotionIE from .dailymail import DailyMailIE @@ -2556,6 +2557,11 @@ class GenericIE(InfoExtractor): if redtube_urls: return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) + # Look for embedded Tube8 player + tube8_urls = Tube8IE._extract_urls(webpage) + if tube8_urls: + return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) + # Look for embedded Tvigle player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 1853a1104..368c45729 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -31,6 +31,12 @@ class Tube8IE(KeezMoviesIE): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)', + webpage) + def _real_extract(self, url): webpage, info = self._extract_info(url) From 94c3442e6ae176b01b3b5eae0a3adc355319b569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Apr 2018 01:03:55 +0700 Subject: [PATCH 042/488] [YoutubeDL] Do not save/restore console title while simulate (closes #16103) --- youtube_dl/YoutubeDL.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 523dd1f7d..fca4999eb 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -532,6 +532,8 @@ class YoutubeDL(object): def save_console_title(self): if not self.params.get('consoletitle', False): return + if self.params.get('simulate', False): + return if compat_os_name != 'nt' and 'TERM' in os.environ: # Save the title on stack self._write_string('\033[22;0t', self._screen_file) @@ -539,6 +541,8 @@ class YoutubeDL(object): def restore_console_title(self): if not self.params.get('consoletitle', False): return + if self.params.get('simulate', False): + return if compat_os_name != 'nt' and 'TERM' in os.environ: # Restore the title from stack self._write_string('\033[23;0t', self._screen_file) From 880ed89d491af9d85680777422c49e07f747e095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Apr 2018 01:14:47 +0700 Subject: [PATCH 043/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ChangeLog b/ChangeLog index 89dfbd8b8..9b01a8062 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,26 @@ +version <unreleased> + +Core +* [YoutubeDL] Do not save/restore console title while simulate (#16103) +* [extractor/common] Relax JSON-LD context check (#16006) + +Extractors ++ [generic] Add support for tube8 embeds ++ [generic] Add support for share-videos.se embeds (#16089, #16115) +* [odnoklassniki] Extend URL regular expression (#16081) +* [steam] Bypass mature content check (#16113) ++ [acast] Extract more metadata +* [acast] Fix extraction (#16118) +* [instagram:user] Fix extraction (#16119) +* [drtuber] Fix title extraction (#16107, #16108) +* [liveleak] Extend URL regular expression (#16117) ++ [openload] Add support for oload.xyz +* [openload] Relax stream URL regular expression +* [openload] Fix extraction (#16099) ++ [svtplay:series] Add support for season URLs ++ [svtplay:series] Add support for series (#11130, #16059) + + version 2018.04.03 Extractors From f7f9757efcd4f5eaaa31e16ff14fc6627f515393 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Apr 2018 01:19:27 +0700 Subject: [PATCH 044/488] release 2018.04.09 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 99e8acd33..ed622afd1 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.03*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.03** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.09** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.04.03 +[debug] youtube-dl version 2018.04.09 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 9b01a8062..4385c4091 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.04.09 Core * [YoutubeDL] Do not save/restore console title while simulate (#16103) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 17baac5ab..1c13199d4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -804,6 +804,7 @@ - **SunPorno** - **SVT** - **SVTPlay**: SVT Play and Öppet arkiv + - **SVTSeries** - **SWRMediathek** - **Syfy** - **SztvHu** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a3163509c..307d6041a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.04.03' +__version__ = '2018.04.09' From fce7962691a0f5874753cad431a8bb6ed31efc69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 10 Apr 2018 23:07:37 +0700 Subject: [PATCH 045/488] [twitch] Add support for mobile URLs (closes #16146) --- youtube_dl/extractor/twitch.py | 47 ++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 1981b4d4a..f736283e9 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -28,7 +28,7 @@ from ..utils import ( class TwitchBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:(?:www|go)\.)?twitch\.tv' + _VALID_URL_BASE = r'https?://(?:(?:www|go|m)\.)?twitch\.tv' _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'https://usher.ttvnw.net' @@ -226,7 +226,7 @@ class TwitchVodIE(TwitchItemBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:(?:www|go)\.)?twitch\.tv/(?:[^/]+/v|videos)/| + (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v|videos)/| player\.twitch\.tv/\?.*?\bvideo=v ) (?P<id>\d+) @@ -279,6 +279,9 @@ class TwitchVodIE(TwitchItemBaseIE): }, { 'url': 'https://www.twitch.tv/videos/6528877', 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/beagsandjam/v/247478721', + 'only_matching': True, }] def _real_extract(self, url): @@ -390,14 +393,17 @@ class TwitchProfileIE(TwitchPlaylistBaseIE): _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE _PLAYLIST_TYPE = 'profile' - _TEST = { + _TESTS = [{ 'url': 'http://www.twitch.tv/vanillatv/profile', 'info_dict': { 'id': 'vanillatv', 'title': 'VanillaTV', }, 'playlist_mincount': 412, - } + }, { + 'url': 'http://m.twitch.tv/vanillatv/profile', + 'only_matching': True, + }] class TwitchVideosBaseIE(TwitchPlaylistBaseIE): @@ -411,14 +417,17 @@ class TwitchAllVideosIE(TwitchVideosBaseIE): _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive,upload,highlight' _PLAYLIST_TYPE = 'all videos' - _TEST = { + _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/all', 'info_dict': { 'id': 'spamfish', 'title': 'Spamfish', }, 'playlist_mincount': 869, - } + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/all', + 'only_matching': True, + }] class TwitchUploadsIE(TwitchVideosBaseIE): @@ -427,14 +436,17 @@ class TwitchUploadsIE(TwitchVideosBaseIE): _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'upload' _PLAYLIST_TYPE = 'uploads' - _TEST = { + _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/uploads', 'info_dict': { 'id': 'spamfish', 'title': 'Spamfish', }, 'playlist_mincount': 0, - } + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/uploads', + 'only_matching': True, + }] class TwitchPastBroadcastsIE(TwitchVideosBaseIE): @@ -443,14 +455,17 @@ class TwitchPastBroadcastsIE(TwitchVideosBaseIE): _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive' _PLAYLIST_TYPE = 'past broadcasts' - _TEST = { + _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/past-broadcasts', 'info_dict': { 'id': 'spamfish', 'title': 'Spamfish', }, 'playlist_mincount': 0, - } + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/past-broadcasts', + 'only_matching': True, + }] class TwitchHighlightsIE(TwitchVideosBaseIE): @@ -459,14 +474,17 @@ class TwitchHighlightsIE(TwitchVideosBaseIE): _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'highlight' _PLAYLIST_TYPE = 'highlights' - _TEST = { + _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/highlights', 'info_dict': { 'id': 'spamfish', 'title': 'Spamfish', }, 'playlist_mincount': 805, - } + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/highlights', + 'only_matching': True, + }] class TwitchStreamIE(TwitchBaseIE): @@ -474,7 +492,7 @@ class TwitchStreamIE(TwitchBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:(?:www|go)\.)?twitch\.tv/| + (?:(?:www|go|m)\.)?twitch\.tv/| player\.twitch\.tv/\?.*?\bchannel= ) (?P<id>[^/#?]+) @@ -508,6 +526,9 @@ class TwitchStreamIE(TwitchBaseIE): }, { 'url': 'https://go.twitch.tv/food', 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/food', + 'only_matching': True, }] @classmethod From dd9aea8cbdabc7622446d387ed6ed59e47b79de7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 11 Apr 2018 01:25:41 +0700 Subject: [PATCH 046/488] [instagram:user] Add request signing (closes #16119) --- youtube_dl/extractor/instagram.py | 161 +++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 9f570249f..1c917bc95 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -2,14 +2,20 @@ from __future__ import unicode_literals import itertools import json +import os import re +import subprocess +import tempfile from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + check_executable, + ExtractorError, get_element_by_attribute, int_or_none, lowercase_escape, + std_headers, try_get, ) @@ -238,24 +244,140 @@ class InstagramUserIE(InfoExtractor): } } - def _entries(self, uploader_id): + _SIGN_CODE = ''' +"use strict"; +function i(e, t) { + var r = (65535 & e) + (65535 & t); + return (e >> 16) + (t >> 16) + (r >> 16) << 16 | 65535 & r +} +function a(e, t, r, n, o, a) { + return i((s = i(i(t, e), i(n, a))) << (c = o) | s >>> 32 - c, r); + var s, c +} +function s(e, t, r, n, o, i, s) { + return a(t & r | ~t & n, e, t, o, i, s) +} +function c(e, t, r, n, o, i, s) { + return a(t & n | r & ~n, e, t, o, i, s) +} +function u(e, t, r, n, o, i, s) { + return a(t ^ r ^ n, e, t, o, i, s) +} +function l(e, t, r, n, o, i, s) { + return a(r ^ (t | ~n), e, t, o, i, s) +} +function p(e, t) { + var r, n, o, a, p; + e[t >> 5] |= 128 << t % 32, + e[14 + (t + 64 >>> 9 << 4)] = t; + var d = 1732584193 + , f = -271733879 + , h = -1732584194 + , g = 271733878; + for (r = 0; r < e.length; r += 16) + n = d, + o = f, + a = h, + p = g, + f = l(f = l(f = l(f = l(f = u(f = u(f = u(f = u(f = c(f = c(f = c(f = c(f = s(f = s(f = s(f = s(f, h = s(h, g = s(g, d = s(d, f, h, g, e[r], 7, -680876936), f, h, e[r + 1], 12, -389564586), d, f, e[r + 2], 17, 606105819), g, d, e[r + 3], 22, -1044525330), h = s(h, g = s(g, d = s(d, f, h, g, e[r + 4], 7, -176418897), f, h, e[r + 5], 12, 1200080426), d, f, e[r + 6], 17, -1473231341), g, d, e[r + 7], 22, -45705983), h = s(h, g = s(g, d = s(d, f, h, g, e[r + 8], 7, 1770035416), f, h, e[r + 9], 12, -1958414417), d, f, e[r + 10], 17, -42063), g, d, e[r + 11], 22, -1990404162), h = s(h, g = s(g, d = s(d, f, h, g, e[r + 12], 7, 1804603682), f, h, e[r + 13], 12, -40341101), d, f, e[r + 14], 17, -1502002290), g, d, e[r + 15], 22, 1236535329), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 1], 5, -165796510), f, h, e[r + 6], 9, -1069501632), d, f, e[r + 11], 14, 643717713), g, d, e[r], 20, -373897302), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 5], 5, -701558691), f, h, e[r + 10], 9, 38016083), d, f, e[r + 15], 14, -660478335), g, d, e[r + 4], 20, -405537848), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 9], 5, 568446438), f, h, e[r + 14], 9, -1019803690), d, f, e[r + 3], 14, -187363961), g, d, e[r + 8], 20, 1163531501), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 13], 5, -1444681467), f, h, e[r + 2], 9, -51403784), d, f, e[r + 7], 14, 1735328473), g, d, e[r + 12], 20, -1926607734), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 5], 4, -378558), f, h, e[r + 8], 11, -2022574463), d, f, e[r + 11], 16, 1839030562), g, d, e[r + 14], 23, -35309556), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 1], 4, -1530992060), f, h, e[r + 4], 11, 1272893353), d, f, e[r + 7], 16, -155497632), g, d, e[r + 10], 23, -1094730640), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 13], 4, 681279174), f, h, e[r], 11, -358537222), d, f, e[r + 3], 16, -722521979), g, d, e[r + 6], 23, 76029189), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 9], 4, -640364487), f, h, e[r + 12], 11, -421815835), d, f, e[r + 15], 16, 530742520), g, d, e[r + 2], 23, -995338651), h = l(h, g = l(g, d = l(d, f, h, g, e[r], 6, -198630844), f, h, e[r + 7], 10, 1126891415), d, f, e[r + 14], 15, -1416354905), g, d, e[r + 5], 21, -57434055), h = l(h, g = l(g, d = l(d, f, h, g, e[r + 12], 6, 1700485571), f, h, e[r + 3], 10, -1894986606), d, f, e[r + 10], 15, -1051523), g, d, e[r + 1], 21, -2054922799), h = l(h, g = l(g, d = l(d, f, h, g, e[r + 8], 6, 1873313359), f, h, e[r + 15], 10, -30611744), d, f, e[r + 6], 15, -1560198380), g, d, e[r + 13], 21, 1309151649), h = l(h, g = l(g, d = l(d, f, h, g, e[r + 4], 6, -145523070), f, h, e[r + 11], 10, -1120210379), d, f, e[r + 2], 15, 718787259), g, d, e[r + 9], 21, -343485551), + d = i(d, n), + f = i(f, o), + h = i(h, a), + g = i(g, p); + return [d, f, h, g] +} +function d(e) { + var t, r = "", n = 32 * e.length; + for (t = 0; t < n; t += 8) + r += String.fromCharCode(e[t >> 5] >>> t % 32 & 255); + return r +} +function f(e) { + var t, r = []; + for (r[(e.length >> 2) - 1] = void 0, + t = 0; t < r.length; t += 1) + r[t] = 0; + var n = 8 * e.length; + for (t = 0; t < n; t += 8) + r[t >> 5] |= (255 & e.charCodeAt(t / 8)) << t % 32; + return r +} +function h(e) { + var t, r, n = ""; + for (r = 0; r < e.length; r += 1) + t = e.charCodeAt(r), + n += "0123456789abcdef".charAt(t >>> 4 & 15) + "0123456789abcdef".charAt(15 & t); + return n +} +function g(e) { + return unescape(encodeURIComponent(e)) +} +function b(e) { + return function(e) { + return d(p(f(e), 8 * e.length)) + }(g(e)) +} +function m(e, t) { + return function(e, t) { + var r, n, o = f(e), i = [], a = []; + for (i[15] = a[15] = void 0, + o.length > 16 && (o = p(o, 8 * e.length)), + r = 0; r < 16; r += 1) + i[r] = 909522486 ^ o[r], + a[r] = 1549556828 ^ o[r]; + return n = p(i.concat(f(t)), 512 + 8 * t.length), + d(p(a.concat(n), 640)) + }(g(e), g(t)) +} +function v(e, t, r) { + return t ? r ? m(t, e) : h(m(t, e)) : r ? b(e) : h(b(e)) +} +function sign(s) { + return v(s); +} +''' + + def _entries(self, data): def get_count(suffix): return int_or_none(try_get( node, lambda x: x['edge_media_' + suffix]['count'])) + uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + csrf_token = data['config']['csrf_token'] + rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' + self._set_cookie('instagram.com', 'ig_pr', '1') + def sign(s): + js_code = self._SIGN_CODE + "console.log(sign('%s')); phantom.exit();" % s + with open(self._phantomjs_script.name, 'w') as f: + f.write(js_code) + p = subprocess.Popen( + ['phantomjs', '--ssl-protocol=any', f.name], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + gis, err = p.communicate() + if p.returncode != 0: + raise ExtractorError('Failed to sign request\n:' + err.decode('utf-8')) + return gis.decode('utf-8').strip() + cursor = '' for page_num in itertools.count(1): + variables = json.dumps({ + 'id': uploader_id, + 'first': 100, + 'after': cursor, + }) + gis = sign( + '%s:%s:%s:%s' + % (rhx_gis, csrf_token, std_headers['User-Agent'], variables)) media = self._download_json( 'https://www.instagram.com/graphql/query/', uploader_id, - 'Downloading JSON page %d' % page_num, query={ + 'Downloading JSON page %d' % page_num, headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-Instagram-GIS': gis, + }, query={ 'query_hash': '472f257a40c653c64c666ce877d59d2b', - 'variables': json.dumps({ - 'id': uploader_id, - 'first': 100, - 'after': cursor, - }) + 'variables': variables, })['data']['user']['edge_owner_to_timeline_media'] edges = media.get('edges') @@ -309,11 +431,26 @@ class InstagramUserIE(InfoExtractor): if not cursor or not isinstance(cursor, compat_str): break + def _real_initialize(self): + if not check_executable('phantomjs', ['-v']): + raise ExtractorError( + 'PhantomJS executable not found in PATH, download it from http://phantomjs.org', + expected=True) + self._phantomjs_script = tempfile.NamedTemporaryFile(delete=False) + self._phantomjs_script.close() + + def __del__(self): + os.unlink(self._phantomjs_script.name) + def _real_extract(self, url): username = self._match_id(url) - uploader_id = self._download_json( - 'https://instagram.com/%s/' % username, username, query={ - '__a': 1, - })['graphql']['user']['id'] + + webpage = self._download_webpage(url, username) + + data = self._parse_json( + self._search_regex( + r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), + username) + return self.playlist_result( - self._entries(uploader_id), username, username) + self._entries(data), username, username) From 315ab3d500964f1d8442135889e1886ca6d90100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 11 Apr 2018 01:51:57 +0700 Subject: [PATCH 047/488] [instagram:user] Simplify signing (#16119) --- youtube_dl/extractor/instagram.py | 128 +----------------------------- 1 file changed, 3 insertions(+), 125 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 1c917bc95..76452a6a1 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -1,17 +1,13 @@ from __future__ import unicode_literals import itertools +import hashlib import json -import os import re -import subprocess -import tempfile from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - check_executable, - ExtractorError, get_element_by_attribute, int_or_none, lowercase_escape, @@ -244,99 +240,6 @@ class InstagramUserIE(InfoExtractor): } } - _SIGN_CODE = ''' -"use strict"; -function i(e, t) { - var r = (65535 & e) + (65535 & t); - return (e >> 16) + (t >> 16) + (r >> 16) << 16 | 65535 & r -} -function a(e, t, r, n, o, a) { - return i((s = i(i(t, e), i(n, a))) << (c = o) | s >>> 32 - c, r); - var s, c -} -function s(e, t, r, n, o, i, s) { - return a(t & r | ~t & n, e, t, o, i, s) -} -function c(e, t, r, n, o, i, s) { - return a(t & n | r & ~n, e, t, o, i, s) -} -function u(e, t, r, n, o, i, s) { - return a(t ^ r ^ n, e, t, o, i, s) -} -function l(e, t, r, n, o, i, s) { - return a(r ^ (t | ~n), e, t, o, i, s) -} -function p(e, t) { - var r, n, o, a, p; - e[t >> 5] |= 128 << t % 32, - e[14 + (t + 64 >>> 9 << 4)] = t; - var d = 1732584193 - , f = -271733879 - , h = -1732584194 - , g = 271733878; - for (r = 0; r < e.length; r += 16) - n = d, - o = f, - a = h, - p = g, - f = l(f = l(f = l(f = l(f = u(f = u(f = u(f = u(f = c(f = c(f = c(f = c(f = s(f = s(f = s(f = s(f, h = s(h, g = s(g, d = s(d, f, h, g, e[r], 7, -680876936), f, h, e[r + 1], 12, -389564586), d, f, e[r + 2], 17, 606105819), g, d, e[r + 3], 22, -1044525330), h = s(h, g = s(g, d = s(d, f, h, g, e[r + 4], 7, -176418897), f, h, e[r + 5], 12, 1200080426), d, f, e[r + 6], 17, -1473231341), g, d, e[r + 7], 22, -45705983), h = s(h, g = s(g, d = s(d, f, h, g, e[r + 8], 7, 1770035416), f, h, e[r + 9], 12, -1958414417), d, f, e[r + 10], 17, -42063), g, d, e[r + 11], 22, -1990404162), h = s(h, g = s(g, d = s(d, f, h, g, e[r + 12], 7, 1804603682), f, h, e[r + 13], 12, -40341101), d, f, e[r + 14], 17, -1502002290), g, d, e[r + 15], 22, 1236535329), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 1], 5, -165796510), f, h, e[r + 6], 9, -1069501632), d, f, e[r + 11], 14, 643717713), g, d, e[r], 20, -373897302), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 5], 5, -701558691), f, h, e[r + 10], 9, 38016083), d, f, e[r + 15], 14, -660478335), g, d, e[r + 4], 20, -405537848), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 9], 5, 568446438), f, h, e[r + 14], 9, -1019803690), d, f, e[r + 3], 14, -187363961), g, d, e[r + 8], 20, 1163531501), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 13], 5, -1444681467), f, h, e[r + 2], 9, -51403784), d, f, e[r + 7], 14, 1735328473), g, d, e[r + 12], 20, -1926607734), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 5], 4, -378558), f, h, e[r + 8], 11, -2022574463), d, f, e[r + 11], 16, 1839030562), g, d, e[r + 14], 23, -35309556), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 1], 4, -1530992060), f, h, e[r + 4], 11, 1272893353), d, f, e[r + 7], 16, -155497632), g, d, e[r + 10], 23, -1094730640), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 13], 4, 681279174), f, h, e[r], 11, -358537222), d, f, e[r + 3], 16, -722521979), g, d, e[r + 6], 23, 76029189), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 9], 4, -640364487), f, h, e[r + 12], 11, -421815835), d, f, e[r + 15], 16, 530742520), g, d, e[r + 2], 23, -995338651), h = l(h, g = l(g, d = l(d, f, h, g, e[r], 6, -198630844), f, h, e[r + 7], 10, 1126891415), d, f, e[r + 14], 15, -1416354905), g, d, e[r + 5], 21, -57434055), h = l(h, g = l(g, d = l(d, f, h, g, e[r + 12], 6, 1700485571), f, h, e[r + 3], 10, -1894986606), d, f, e[r + 10], 15, -1051523), g, d, e[r + 1], 21, -2054922799), h = l(h, g = l(g, d = l(d, f, h, g, e[r + 8], 6, 1873313359), f, h, e[r + 15], 10, -30611744), d, f, e[r + 6], 15, -1560198380), g, d, e[r + 13], 21, 1309151649), h = l(h, g = l(g, d = l(d, f, h, g, e[r + 4], 6, -145523070), f, h, e[r + 11], 10, -1120210379), d, f, e[r + 2], 15, 718787259), g, d, e[r + 9], 21, -343485551), - d = i(d, n), - f = i(f, o), - h = i(h, a), - g = i(g, p); - return [d, f, h, g] -} -function d(e) { - var t, r = "", n = 32 * e.length; - for (t = 0; t < n; t += 8) - r += String.fromCharCode(e[t >> 5] >>> t % 32 & 255); - return r -} -function f(e) { - var t, r = []; - for (r[(e.length >> 2) - 1] = void 0, - t = 0; t < r.length; t += 1) - r[t] = 0; - var n = 8 * e.length; - for (t = 0; t < n; t += 8) - r[t >> 5] |= (255 & e.charCodeAt(t / 8)) << t % 32; - return r -} -function h(e) { - var t, r, n = ""; - for (r = 0; r < e.length; r += 1) - t = e.charCodeAt(r), - n += "0123456789abcdef".charAt(t >>> 4 & 15) + "0123456789abcdef".charAt(15 & t); - return n -} -function g(e) { - return unescape(encodeURIComponent(e)) -} -function b(e) { - return function(e) { - return d(p(f(e), 8 * e.length)) - }(g(e)) -} -function m(e, t) { - return function(e, t) { - var r, n, o = f(e), i = [], a = []; - for (i[15] = a[15] = void 0, - o.length > 16 && (o = p(o, 8 * e.length)), - r = 0; r < 16; r += 1) - i[r] = 909522486 ^ o[r], - a[r] = 1549556828 ^ o[r]; - return n = p(i.concat(f(t)), 512 + 8 * t.length), - d(p(a.concat(n), 640)) - }(g(e), g(t)) -} -function v(e, t, r) { - return t ? r ? m(t, e) : h(m(t, e)) : r ? b(e) : h(b(e)) -} -function sign(s) { - return v(s); -} -''' - def _entries(self, data): def get_count(suffix): return int_or_none(try_get( @@ -348,18 +251,6 @@ function sign(s) { self._set_cookie('instagram.com', 'ig_pr', '1') - def sign(s): - js_code = self._SIGN_CODE + "console.log(sign('%s')); phantom.exit();" % s - with open(self._phantomjs_script.name, 'w') as f: - f.write(js_code) - p = subprocess.Popen( - ['phantomjs', '--ssl-protocol=any', f.name], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - gis, err = p.communicate() - if p.returncode != 0: - raise ExtractorError('Failed to sign request\n:' + err.decode('utf-8')) - return gis.decode('utf-8').strip() - cursor = '' for page_num in itertools.count(1): variables = json.dumps({ @@ -367,14 +258,12 @@ function sign(s) { 'first': 100, 'after': cursor, }) - gis = sign( - '%s:%s:%s:%s' - % (rhx_gis, csrf_token, std_headers['User-Agent'], variables)) + s = '%s:%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent'], variables) media = self._download_json( 'https://www.instagram.com/graphql/query/', uploader_id, 'Downloading JSON page %d' % page_num, headers={ 'X-Requested-With': 'XMLHttpRequest', - 'X-Instagram-GIS': gis, + 'X-Instagram-GIS': hashlib.md5(s.encode('utf-8')).hexdigest(), }, query={ 'query_hash': '472f257a40c653c64c666ce877d59d2b', 'variables': variables, @@ -431,17 +320,6 @@ function sign(s) { if not cursor or not isinstance(cursor, compat_str): break - def _real_initialize(self): - if not check_executable('phantomjs', ['-v']): - raise ExtractorError( - 'PhantomJS executable not found in PATH, download it from http://phantomjs.org', - expected=True) - self._phantomjs_script = tempfile.NamedTemporaryFile(delete=False) - self._phantomjs_script.close() - - def __del__(self): - os.unlink(self._phantomjs_script.name) - def _real_extract(self, url): username = self._match_id(url) From d783aee56a720ce15cf2775afc330b2ed5d53baf Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Wed, 11 Apr 2018 09:11:24 -0400 Subject: [PATCH 048/488] [fxnetworks] Add support for https theplatform URLs (closes #16125) --- youtube_dl/extractor/fxnetworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py index 37549fb01..00e67426b 100644 --- a/youtube_dl/extractor/fxnetworks.py +++ b/youtube_dl/extractor/fxnetworks.py @@ -41,7 +41,7 @@ class FXNetworksIE(AdobePassIE): if 'The content you are trying to access is not available in your region.' in webpage: self.raise_geo_restricted() video_data = extract_attributes(self._search_regex( - r'(<a.+?rel="http://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) + r'(<a.+?rel="https?://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) release_url = video_data['rel'] title = video_data['data-title'] From 64f03e5b4c86f7c7e6d660267d77e02da621a94d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 11 Apr 2018 23:28:55 +0700 Subject: [PATCH 049/488] [cbc:watch] Re-acquire device token when expired (closes #16160) --- youtube_dl/extractor/cbc.py | 59 +++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 3be0c646b..54b4b9be9 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -5,7 +5,10 @@ import json import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_HTTPError, +) from ..utils import ( js_to_json, smuggle_url, @@ -206,30 +209,48 @@ class CBCWatchBaseIE(InfoExtractor): def _call_api(self, path, video_id): url = path if path.startswith('http') else self._API_BASE_URL + path - result = self._download_xml(url, video_id, headers={ - 'X-Clearleap-DeviceId': self._device_id, - 'X-Clearleap-DeviceToken': self._device_token, - }) + for _ in range(2): + try: + result = self._download_xml(url, video_id, headers={ + 'X-Clearleap-DeviceId': self._device_id, + 'X-Clearleap-DeviceToken': self._device_token, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + # Device token has expired, re-acquiring device token + self._register_device() + continue + raise error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage') if error_message: raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message)) return result def _real_initialize(self): - if not self._device_id or not self._device_token: - device = self._downloader.cache.load('cbcwatch', 'device') or {} - self._device_id, self._device_token = device.get('id'), device.get('token') - if not self._device_id or not self._device_token: - result = self._download_xml( - self._API_BASE_URL + 'device/register', - None, data=b'<device><type>web</type></device>') - self._device_id = xpath_text(result, 'deviceId', fatal=True) - self._device_token = xpath_text(result, 'deviceToken', fatal=True) - self._downloader.cache.store( - 'cbcwatch', 'device', { - 'id': self._device_id, - 'token': self._device_token, - }) + if self._valid_device_token(): + return + device = self._downloader.cache.load('cbcwatch', 'device') or {} + self._device_id, self._device_token = device.get('id'), device.get('token') + if self._valid_device_token(): + return + self._register_device() + + def _valid_device_token(self): + return self._device_id and self._device_token + + def _register_device(self): + self._device_id = self._device_token = None + result = self._download_xml( + self._API_BASE_URL + 'device/register', + None, 'Acquiring device token', + data=b'<device><type>web</type></device>') + self._device_id = xpath_text(result, 'deviceId', fatal=True) + self._device_token = xpath_text(result, 'deviceToken', fatal=True) + self._downloader.cache.store( + 'cbcwatch', 'device', { + 'id': self._device_id, + 'token': self._device_token, + }) def _parse_rss_feed(self, rss): channel = xpath_element(rss, 'channel', fatal=True) From 92ded33a05e0760d0ae0e804a1dca0d8a84f3d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 12 Apr 2018 04:53:45 +0700 Subject: [PATCH 050/488] [pornhub] Relax _VALID_URLs (closes #16165) --- youtube_dl/extractor/pornhub.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 9ce513aeb..23e24d216 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,7 +33,7 @@ class PornHubIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P<id>[\da-z]+) @@ -264,7 +264,7 @@ class PornHubPlaylistBaseIE(InfoExtractor): class PornHubPlaylistIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/playlist/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.pornhub.com/playlist/4667351', 'info_dict': { @@ -272,11 +272,14 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE): 'title': 'Nataly Hot', }, 'playlist_mincount': 2, + }, { + 'url': 'https://de.pornhub.com/playlist/4667351', + 'only_matching': True, }] class PornHubUserVideosIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:user|channel)s/(?P<id>[^/]+)/videos' + _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:user|channel)s/(?P<id>[^/]+)/videos' _TESTS = [{ 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'info_dict': { @@ -305,6 +308,9 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): # Most Viewed Videos 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi', 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', + 'only_matching': True, }] def _real_extract(self, url): From 68ddba20ae4e0ab28146e80d3e112a5a2661c386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 13 Apr 2018 22:27:52 +0700 Subject: [PATCH 051/488] [instagram:user] Remove User-Agent from signature (closes #16119) --- youtube_dl/extractor/instagram.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 76452a6a1..8da1d5f2f 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -11,7 +11,6 @@ from ..utils import ( get_element_by_attribute, int_or_none, lowercase_escape, - std_headers, try_get, ) @@ -258,7 +257,7 @@ class InstagramUserIE(InfoExtractor): 'first': 100, 'after': cursor, }) - s = '%s:%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent'], variables) + s = '%s:%s:%s' % (rhx_gis, csrf_token, variables) media = self._download_json( 'https://www.instagram.com/graphql/query/', uploader_id, 'Downloading JSON page %d' % page_num, headers={ From 9b5aead6aa8ad82a5eecd2bc26c0e94399e92ca7 Mon Sep 17 00:00:00 2001 From: Timmy <hello@timmy.ws> Date: Sat, 14 Apr 2018 17:04:42 +0200 Subject: [PATCH 052/488] [vine:user] Fix extraction (closes #15514) --- youtube_dl/extractor/vine.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 46950d3a1..08ddffa66 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import itertools from .common import InfoExtractor from ..utils import ( @@ -116,14 +115,14 @@ class VineUserIE(InfoExtractor): _VINE_BASE_URL = 'https://vine.co/' _TESTS = [ { - 'url': 'https://vine.co/Visa', + 'url': 'https://vine.co/itsruthb', 'info_dict': { - 'id': 'Visa', + 'id': 'itsruthb', }, - 'playlist_mincount': 46, + 'playlist_mincount': 611, }, { - 'url': 'https://vine.co/u/941705360593584128', + 'url': 'https://vine.co/u/942914934646415360', 'only_matching': True, }, ] @@ -139,16 +138,10 @@ class VineUserIE(InfoExtractor): profile_url, user, note='Downloading user profile data') user_id = profile_data['data']['userId'] - timeline_data = [] - for pagenum in itertools.count(1): - timeline_url = '%sapi/timelines/users/%s?page=%s&size=100' % ( - self._VINE_BASE_URL, user_id, pagenum) - timeline_page = self._download_json( - timeline_url, user, note='Downloading page %d' % pagenum) - timeline_data.extend(timeline_page['data']['records']) - if timeline_page['data']['nextPage'] is None: - break - + user_archive = self._download_json( + 'https://archive.vine.co/profiles/%s.json' % user_id, user_id) + posts = user_archive['posts'] entries = [ - self.url_result(e['permalinkUrl'], 'Vine') for e in timeline_data] + self.url_result('https://vine.co/v/%s' % post_id, 'Vine') + for post_id in posts] return self.playlist_result(entries, user) From 8e41c9ad01b6deda96c29f685c4d8861b8759ba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 15 Apr 2018 22:43:25 +0700 Subject: [PATCH 053/488] [vine:user] Improve extraction (closes #16190) --- youtube_dl/extractor/vine.py | 45 +++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 08ddffa66..80b896b56 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( determine_ext, int_or_none, @@ -111,21 +112,24 @@ class VineIE(InfoExtractor): class VineUserIE(InfoExtractor): IE_NAME = 'vine:user' - _VALID_URL = r'(?:https?://)?vine\.co/(?P<u>u/)?(?P<user>[^/]+)/?(\?.*)?$' + _VALID_URL = r'https?://vine\.co/(?P<u>u/)?(?P<user>[^/]+)' _VINE_BASE_URL = 'https://vine.co/' - _TESTS = [ - { - 'url': 'https://vine.co/itsruthb', - 'info_dict': { - 'id': 'itsruthb', - }, - 'playlist_mincount': 611, + _TESTS = [{ + 'url': 'https://vine.co/itsruthb', + 'info_dict': { + 'id': 'itsruthb', + 'title': 'Ruth B', + 'description': '| Instagram/Twitter: itsruthb | still a lost boy from neverland', }, - { - 'url': 'https://vine.co/u/942914934646415360', - 'only_matching': True, - }, - ] + 'playlist_mincount': 611, + }, { + 'url': 'https://vine.co/u/942914934646415360', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if VineIE.suitable(url) else super(VineUserIE, cls).suitable(url) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -137,11 +141,14 @@ class VineUserIE(InfoExtractor): profile_data = self._download_json( profile_url, user, note='Downloading user profile data') - user_id = profile_data['data']['userId'] - user_archive = self._download_json( + data = profile_data['data'] + user_id = data.get('userId') or data['userIdStr'] + profile = self._download_json( 'https://archive.vine.co/profiles/%s.json' % user_id, user_id) - posts = user_archive['posts'] entries = [ - self.url_result('https://vine.co/v/%s' % post_id, 'Vine') - for post_id in posts] - return self.playlist_result(entries, user) + self.url_result( + 'https://vine.co/v/%s' % post_id, ie='Vine', video_id=post_id) + for post_id in profile['posts'] + if post_id and isinstance(post_id, compat_str)] + return self.playlist_result( + entries, user, profile.get('username'), profile.get('description')) From d6166a7602f5b78a4bb552ba0f4b176cbc0a4a03 Mon Sep 17 00:00:00 2001 From: Patrick Griffis <tingping@tingping.se> Date: Tue, 21 Mar 2017 00:49:31 +0200 Subject: [PATCH 054/488] [picarto] Add extractor --- youtube_dl/extractor/extractors.py | 4 ++ youtube_dl/extractor/picarto.py | 87 ++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100755 youtube_dl/extractor/picarto.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c9f60114d..d83e93dec 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -815,6 +815,10 @@ from .periscope import ( from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .picarto import ( + PicartoVodIE, + PicartoIE, +) from .piksel import PikselIE from .pinkbike import PinkbikeIE from .pladform import PladformIE diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py new file mode 100755 index 000000000..1d6f714ed --- /dev/null +++ b/youtube_dl/extractor/picarto.py @@ -0,0 +1,87 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, js_to_json, urlencode_postdata + + +class PicartoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)[^/]*$' + _TEST = { + 'url': 'https://picarto.tv/Setz', + 'info_dict': { + 'id': 'Setz', + 'ext': 'mp4', + 'title': 're:^Setz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'timestamp': int, + 'is_live': True + }, + 'params': { + 'skip_download': True + } + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + stream_page = self._download_webpage(url, channel_id) + + if 'This channel does not exist.' in stream_page: + raise ExtractorError('Channel does not exist', expected=True) + + player_settings_js = self._html_search_regex( + r'(?s)playerSettings\[1\]\s*=\s*(\{.+?\}\n)', stream_page, 'player-settings') + player_settings = self._parse_json(player_settings_js, channel_id, + transform_source=js_to_json) + if not player_settings.get('online'): + raise ExtractorError('Stream is offline', expected=True) + + cdn_data = self._download_json('https://picarto.tv/process/channel', channel_id, + data=urlencode_postdata({'loadbalancinginfo': channel_id}), + note='Fetching load balancer info') + edge = [edge['ep'] for edge in cdn_data['edges'] if edge['id'] == cdn_data['preferedEdge']][0] + + formats = self._extract_m3u8_formats('https://%s/hls/%s/index.m3u8' % (edge, channel_id), + channel_id, 'mp4') + formats.append({'url': 'https://%s/mp4/%s.mp4' % (edge, channel_id)}) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'formats': formats, + 'ext': 'mp4', + 'title': self._live_title(channel_id), + 'is_live': True, + 'thumbnail': player_settings.get('vodThumb'), + 'age_limit': 18 if player_settings.get('mature') else None, + } + + +class PicartoVodIE(InfoExtractor): + _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[a-zA-Z0-9_\-\.]+).flv' + _TEST = { + 'url': 'https://picarto.tv/videopopout/Carrot_2018.01.11.07.55.12.flv', + 'md5': '80765b67813053ff31d4df2bd5e900ce', + 'info_dict': { + 'id': 'Carrot_2018.01.11.07.55.12', + 'ext': 'mp4', + 'title': 'Carrot_2018.01.11.07.55.12', + 'thumbnail': r're:^https?://.*\.jpg$' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + vod_info_js = self._html_search_regex(r'(?s)"#vod-player",\s*(\{.+?\})\)', + webpage, video_id) + vod_info = self._parse_json(vod_info_js, video_id, transform_source=js_to_json) + + return { + 'id': video_id, + 'title': video_id, + 'ext': 'mp4', + 'protocol': 'm3u8', + 'url': vod_info['vod'], + 'thumbnail': vod_info.get('vodThumb'), + } From a42839e548d81ae20e5164ae690075d2c423477e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 16 Apr 2018 00:31:25 +0700 Subject: [PATCH 055/488] [picarto] Improve extraction (closes #6205, closes #12514, closes #15276, closes #15551) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/picarto.py | 152 ++++++++++++++++++++++------- 2 files changed, 116 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d83e93dec..3570fa165 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -816,8 +816,8 @@ from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE from .picarto import ( - PicartoVodIE, PicartoIE, + PicartoVodIE, ) from .piksel import PikselIE from .pinkbike import PinkbikeIE diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py index 1d6f714ed..2366dfb34 100755 --- a/youtube_dl/extractor/picarto.py +++ b/youtube_dl/extractor/picarto.py @@ -1,12 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals +import time + from .common import InfoExtractor -from ..utils import ExtractorError, js_to_json, urlencode_postdata +from ..compat import compat_str +from ..utils import ( + ExtractorError, + js_to_json, + try_get, + update_url_query, + urlencode_postdata, +) class PicartoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)[^/]*$' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -16,72 +25,141 @@ class PicartoIE(InfoExtractor): 'timestamp': int, 'is_live': True }, - 'params': { - 'skip_download': True - } + 'skip': 'Stream is offline', } + @classmethod + def suitable(cls, url): + return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) + def _real_extract(self, url): channel_id = self._match_id(url) stream_page = self._download_webpage(url, channel_id) - if 'This channel does not exist.' in stream_page: - raise ExtractorError('Channel does not exist', expected=True) + if '>This channel does not exist' in stream_page: + raise ExtractorError( + 'Channel %s does not exist' % channel_id, expected=True) - player_settings_js = self._html_search_regex( - r'(?s)playerSettings\[1\]\s*=\s*(\{.+?\}\n)', stream_page, 'player-settings') - player_settings = self._parse_json(player_settings_js, channel_id, - transform_source=js_to_json) - if not player_settings.get('online'): + player = self._parse_json( + self._search_regex( + r'(?s)playerSettings\[\d+\]\s*=\s*(\{.+?\}\s*\n)', stream_page, + 'player settings'), + channel_id, transform_source=js_to_json) + + if player.get('online') is False: raise ExtractorError('Stream is offline', expected=True) - cdn_data = self._download_json('https://picarto.tv/process/channel', channel_id, + cdn_data = self._download_json( + 'https://picarto.tv/process/channel', channel_id, data=urlencode_postdata({'loadbalancinginfo': channel_id}), - note='Fetching load balancer info') - edge = [edge['ep'] for edge in cdn_data['edges'] if edge['id'] == cdn_data['preferedEdge']][0] + note='Downloading load balancing info') - formats = self._extract_m3u8_formats('https://%s/hls/%s/index.m3u8' % (edge, channel_id), - channel_id, 'mp4') - formats.append({'url': 'https://%s/mp4/%s.mp4' % (edge, channel_id)}) + def get_event(key): + return try_get(player, lambda x: x['event'][key], compat_str) or '' + + params = { + 'token': player.get('token') or '', + 'ticket': get_event('ticket'), + 'con': int(time.time() * 1000), + 'type': get_event('ticket'), + 'scope': get_event('scope'), + } + + prefered_edge = cdn_data.get('preferedEdge') + default_tech = player.get('defaultTech') + + formats = [] + + for edge in cdn_data['edges']: + edge_ep = edge.get('ep') + if not edge_ep or not isinstance(edge_ep, compat_str): + continue + edge_id = edge.get('id') + for tech in cdn_data['techs']: + tech_label = tech.get('label') + tech_type = tech.get('type') + preference = 0 + if edge_id == prefered_edge: + preference += 1 + if tech_type == default_tech: + preference += 1 + format_id = [] + if edge_id: + format_id.append(edge_id) + if tech_type == 'application/x-mpegurl' or tech_label == 'HLS': + format_id.append('hls') + formats.extend(self._extract_m3u8_formats( + update_url_query( + 'https://%s/hls/%s/index.m3u8' + % (edge_ep, channel_id), params), + channel_id, 'mp4', preference=preference, + m3u8_id='-'.join(format_id), fatal=False)) + continue + elif tech_type == 'video/mp4' or tech_label == 'MP4': + format_id.append('mp4') + formats.append({ + 'url': update_url_query( + 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id), + params), + 'format_id': '-'.join(format_id), + 'preference': preference, + }) + else: + # rtmp format does not seem to work + continue self._sort_formats(formats) + mature = player.get('mature') + if mature is None: + age_limit = None + else: + age_limit = 18 if mature is True else 0 + return { 'id': channel_id, - 'formats': formats, - 'ext': 'mp4', 'title': self._live_title(channel_id), 'is_live': True, - 'thumbnail': player_settings.get('vodThumb'), - 'age_limit': 18 if player_settings.get('mature') else None, + 'thumbnail': player.get('vodThumb'), + 'age_limit': age_limit, + 'formats': formats, } class PicartoVodIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[a-zA-Z0-9_\-\.]+).flv' - _TEST = { - 'url': 'https://picarto.tv/videopopout/Carrot_2018.01.11.07.55.12.flv', - 'md5': '80765b67813053ff31d4df2bd5e900ce', + _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', + 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', 'info_dict': { - 'id': 'Carrot_2018.01.11.07.55.12', + 'id': 'ArtofZod_2017.12.12.00.13.23.flv', 'ext': 'mp4', - 'title': 'Carrot_2018.01.11.07.55.12', - 'thumbnail': r're:^https?://.*\.jpg$' - } - } + 'title': 'ArtofZod_2017.12.12.00.13.23.flv', + 'thumbnail': r're:^https?://.*\.jpg' + }, + }, { + 'url': 'https://picarto.tv/videopopout/Plague', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - vod_info_js = self._html_search_regex(r'(?s)"#vod-player",\s*(\{.+?\})\)', - webpage, video_id) - vod_info = self._parse_json(vod_info_js, video_id, transform_source=js_to_json) + vod_info = self._parse_json( + self._search_regex( + r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, + video_id), + video_id, transform_source=js_to_json) + + formats = self._extract_m3u8_formats( + vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) return { 'id': video_id, 'title': video_id, - 'ext': 'mp4', - 'protocol': 'm3u8', - 'url': vod_info['vod'], 'thumbnail': vod_info.get('vodThumb'), + 'formats': formats, } From c07cb68e7974a2ecd94f4101e6f094414df16e75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 16 Apr 2018 00:54:21 +0700 Subject: [PATCH 056/488] [smotri:broadcast] Fix extraction (closes #16180) --- youtube_dl/extractor/smotri.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 370fa8879..45995f30f 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -310,6 +310,7 @@ class SmotriBroadcastIE(InfoExtractor): IE_DESC = 'Smotri.com broadcasts' IE_NAME = 'smotri:broadcast' _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*' + _NETRC_MACHINE = 'smotri' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -352,17 +353,18 @@ class SmotriBroadcastIE(InfoExtractor): adult_content = False ticket = self._html_search_regex( - r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'([^']+)'\)", - broadcast_page, 'broadcast ticket') + (r'data-user-file=(["\'])(?P<ticket>(?!\1).+)\1', + r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'(?P<ticket>[^']+)'\)"), + broadcast_page, 'broadcast ticket', group='ticket') - url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket + broadcast_url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket broadcast_password = self._downloader.params.get('videopassword') if broadcast_password: - url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() + broadcast_url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() broadcast_json_page = self._download_webpage( - url, broadcast_id, 'Downloading broadcast JSON') + broadcast_url, broadcast_id, 'Downloading broadcast JSON') try: broadcast_json = json.loads(broadcast_json_page) From 0e6ccb3905cb86c53a91af4c9119e2fd102019d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 16 Apr 2018 00:56:05 +0700 Subject: [PATCH 057/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4385c4091..12bda4951 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +version <unreleased> + +Extractors +* [smotri:broadcast] Fix extraction (#16180) ++ [picarto] Add support for picarto.tv (#6205, #12514, #15276, #15551) +* [vine:user] Fix extraction (#15514, #16190) +* [pornhub] Relax URL regular expression (#16165) +* [cbc:watch] Re-acquire device token when expired (#16160) ++ [fxnetworks] Add support for https theplatform URLs (#16125, #16157) ++ [instagram:user] Add request signing (#16119) ++ [twitch] Add support for mobile URLs (#16146) + + version 2018.04.09 Core From bdf7ba6f3a626b4c873257091d0771e54bd02dfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 16 Apr 2018 01:07:21 +0700 Subject: [PATCH 058/488] Set chmod 644 for all extractors --- youtube_dl/extractor/americastestkitchen.py | 0 youtube_dl/extractor/cda.py | 0 youtube_dl/extractor/joj.py | 0 youtube_dl/extractor/picarto.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 youtube_dl/extractor/americastestkitchen.py mode change 100755 => 100644 youtube_dl/extractor/cda.py mode change 100755 => 100644 youtube_dl/extractor/joj.py mode change 100755 => 100644 youtube_dl/extractor/picarto.py diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py old mode 100755 new mode 100644 diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py old mode 100755 new mode 100644 diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py old mode 100755 new mode 100644 diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py old mode 100755 new mode 100644 From 3c92fd1cd5b5ced11f03ebe64104457c21cd69ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 16 Apr 2018 01:09:18 +0700 Subject: [PATCH 059/488] release 2018.04.16 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index ed622afd1..69f996179 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.09** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.16*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.16** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.04.09 +[debug] youtube-dl version 2018.04.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 12bda4951..185fa1753 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.04.16 Extractors * [smotri:broadcast] Fix extraction (#16180) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1c13199d4..715d16cfe 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -628,6 +628,8 @@ - **PhilharmonieDeParis**: Philharmonie de Paris - **phoenix.de** - **Photobucket** + - **Picarto** + - **PicartoVod** - **Piksel** - **Pinkbike** - **Pladform** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 307d6041a..5aefdd0a2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.04.09' +__version__ = '2018.04.16' From 522d6b5c961f584055463f8c69de864ec075083b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Apr 2018 07:48:36 +0100 Subject: [PATCH 060/488] [cbs] skip DRM asset types(fixes #16104) --- youtube_dl/extractor/cbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index f425562ab..1799d63ea 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -65,7 +65,7 @@ class CBSIE(CBSBaseIE): last_e = None for item in items_data.findall('.//item'): asset_type = xpath_text(item, 'assetType') - if not asset_type or asset_type in asset_types: + if not asset_type or asset_type in asset_types or asset_type in ('HLS_FPS', 'DASH_CENC'): continue asset_types.append(asset_type) query = { From 238d42cf5d4b1a95ba42bf56dcb1bf559ac11c29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 17 Apr 2018 22:37:50 +0700 Subject: [PATCH 061/488] [instagram:user] Fix extraction (closes #16119) --- youtube_dl/extractor/instagram.py | 49 ++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 8da1d5f2f..5cea37d92 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -6,11 +6,16 @@ import json import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_HTTPError, +) from ..utils import ( + ExtractorError, get_element_by_attribute, int_or_none, lowercase_escape, + std_headers, try_get, ) @@ -239,6 +244,8 @@ class InstagramUserIE(InfoExtractor): } } + _gis_tmpl = None + def _entries(self, data): def get_count(suffix): return int_or_none(try_get( @@ -257,16 +264,36 @@ class InstagramUserIE(InfoExtractor): 'first': 100, 'after': cursor, }) - s = '%s:%s:%s' % (rhx_gis, csrf_token, variables) - media = self._download_json( - 'https://www.instagram.com/graphql/query/', uploader_id, - 'Downloading JSON page %d' % page_num, headers={ - 'X-Requested-With': 'XMLHttpRequest', - 'X-Instagram-GIS': hashlib.md5(s.encode('utf-8')).hexdigest(), - }, query={ - 'query_hash': '472f257a40c653c64c666ce877d59d2b', - 'variables': variables, - })['data']['user']['edge_owner_to_timeline_media'] + + if self._gis_tmpl: + gis_tmpls = [self._gis_tmpl] + else: + gis_tmpls = [ + '%s' % rhx_gis, + '', + '%s:%s' % (rhx_gis, csrf_token), + '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']), + ] + + for gis_tmpl in gis_tmpls: + try: + media = self._download_json( + 'https://www.instagram.com/graphql/query/', uploader_id, + 'Downloading JSON page %d' % page_num, headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-Instagram-GIS': hashlib.md5( + ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(), + }, query={ + 'query_hash': '42323d64886122307be10013ad2dcc44', + 'variables': variables, + })['data']['user']['edge_owner_to_timeline_media'] + self._gis_tmpl = gis_tmpl + break + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if gis_tmpl != gis_tmpls[-1]: + continue + raise edges = media.get('edges') if not edges or not isinstance(edges, list): From 518d5ba5191e3cc26c81e346ba5117e94db51469 Mon Sep 17 00:00:00 2001 From: Dan Salmon <sa7mon@users.noreply.github.com> Date: Tue, 17 Apr 2018 12:10:02 -0500 Subject: [PATCH 062/488] Fix some tests --- test/test_subtitles.py | 4 ++-- test/test_youtube_lists.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 1b8de822a..7d57a628e 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -232,7 +232,7 @@ class TestNPOSubtitles(BaseTestSubtitles): class TestMTVSubtitles(BaseTestSubtitles): - url = 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother' + url = 'http://www.cc.com/video-clips/p63lk0/adam-devine-s-house-party-chasing-white-swans' IE = ComedyCentralIE def getInfoDict(self): @@ -243,7 +243,7 @@ class TestMTVSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'b9f6ca22a6acf597ec76f61749765e65') + self.assertEqual(md5(subtitles['en']), '78206b8d8a0cfa9da64dc026eea48961') class TestNRKSubtitles(BaseTestSubtitles): diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 7a33dbf88..c4f0abbea 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -61,7 +61,7 @@ class TestYoutubeLists(unittest.TestCase): dl = FakeYDL() dl.params['extract_flat'] = True ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') + result = ie.extract('https://www.youtube.com/playlist?list=PL-KKIb8rvtMSrAO9YFbeM6UQrAqoFTUWv') self.assertIsPlaylist(result) for entry in result['entries']: self.assertTrue(entry.get('title')) From e30991f9206f98605ba6c4880ed40ad5556fa0b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 18 Apr 2018 01:24:02 +0700 Subject: [PATCH 063/488] [kaltura] Improve embeds detection (closes #16201) --- youtube_dl/extractor/generic.py | 18 +++++++++++++++++- youtube_dl/extractor/kaltura.py | 6 +++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e3cb5c5ce..af1322e00 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1220,7 +1220,7 @@ class GenericIE(InfoExtractor): 'title': '35871', 'timestamp': 1355743100, 'upload_date': '20121217', - 'uploader_id': 'batchUser', + 'uploader_id': 'cplapp@learn360.com', }, 'add_ie': ['Kaltura'], }, @@ -1271,6 +1271,22 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # meta twitter:player + 'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/', + 'info_dict': { + 'id': '0_01b42zps', + 'ext': 'mp4', + 'title': 'Main Twerk (Video)', + 'upload_date': '20171208', + 'uploader_id': 'sebastian.salinas@thechive.com', + 'timestamp': 1512713057, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Kaltura'], + }, # referrer protected EaglePlatform embed { 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 562e25f6d..0ea89e4d6 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -135,10 +135,10 @@ class KalturaIE(InfoExtractor): ''', webpage) or re.search( r'''(?xs) - <iframe[^>]+src=(?P<q1>["']) - (?:https?:)?//(?:www\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) + <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["']) + (?:https?:)?//(?:(?:www|cdnapi)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) (?:(?!(?P=q1)).)* - [?&]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+) + [?&;]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+) (?P=q1) ''', webpage) ) From 9b3036bd2e431ed4b037a3df21528a6e9bcb05b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 18 Apr 2018 10:12:24 +0700 Subject: [PATCH 064/488] [instagram:user] Fix extraction (closes #16119) --- youtube_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 5cea37d92..0c13f54ee 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -261,7 +261,7 @@ class InstagramUserIE(InfoExtractor): for page_num in itertools.count(1): variables = json.dumps({ 'id': uploader_id, - 'first': 100, + 'first': 12, 'after': cursor, }) From b004d9bbf18ee2b6a9b916657c4d6734ff0d0adb Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 19 Apr 2018 15:07:50 +0100 Subject: [PATCH 065/488] [cbssports] fix extraction(fixes #16217) --- youtube_dl/extractor/cbssports.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 3a62c840b..27a243d08 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -4,28 +4,33 @@ from .cbs import CBSBaseIE class CBSSportsIE(CBSBaseIE): - _VALID_URL = r'https?://(?:www\.)?cbssports\.com/video/player/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.cbssports.com/video/player/videos/708337219968/0/ben-simmons-the-next-lebron?-not-so-fast', + 'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/', 'info_dict': { - 'id': '708337219968', + 'id': '1214315075735', 'ext': 'mp4', - 'title': 'Ben Simmons the next LeBron? Not so fast', - 'description': 'md5:854294f627921baba1f4b9a990d87197', - 'timestamp': 1466293740, - 'upload_date': '20160618', + 'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder', + 'description': 'md5:df6f48622612c2d6bd2e295ddef58def', + 'timestamp': 1524111457, + 'upload_date': '20180419', 'uploader': 'CBSI-NEW', }, 'params': { # m3u8 download 'skip_download': True, } + }, { + 'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/', + 'only_matching': True, }] def _extract_video_info(self, filter_query, video_id): return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id= self._search_regex([r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], webpage, 'video id') return self._extract_video_info('byId=%s' % video_id, video_id) From d86c5167ae5a1c33451e98d7e05d5b32b6fa3156 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 19 Apr 2018 15:48:03 +0100 Subject: [PATCH 066/488] [nexx] extract new azure urls(closes #16223) --- youtube_dl/extractor/nexx.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index c7029d29e..5e46a75c0 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -230,15 +230,18 @@ class NexxIE(InfoExtractor): azure_locator = stream_data['azureLocator'] - AZURE_URL = 'http://nx%s%02d.akamaized.net/' - - def get_cdn_shield_base(shield_type='', prefix='-p'): + def get_cdn_shield_base(shield_type='', static=False): for secure in ('', 's'): cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) if cdn_shield: return 'http%s://%s' % (secure, cdn_shield) else: - return AZURE_URL % (prefix, int(stream_data['azureAccount'].replace('nexxplayplus', ''))) + if 'fb' in stream_data['azureAccount']: + prefix = 'df' if static else 'f' + else: + prefix = 'd' if static else 'p' + account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) + return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) azure_stream_base = get_cdn_shield_base() is_ml = ',' in language @@ -260,7 +263,7 @@ class NexxIE(InfoExtractor): formats.extend(self._extract_ism_formats( azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) - azure_progressive_base = get_cdn_shield_base('Prog', '-d') + azure_progressive_base = get_cdn_shield_base('Prog', True) azure_file_distribution = stream_data.get('azureFileDistribution') if azure_file_distribution: fds = azure_file_distribution.split(',') From 5a19d231ca8e15d07c2a5ebd3cd6cc46b7596edc Mon Sep 17 00:00:00 2001 From: Douglas Su <d0u9.su@outlook.com> Date: Thu, 19 Apr 2018 23:21:50 +0800 Subject: [PATCH 067/488] [YoutubeDL] Fix typo in media extension compatibility checker --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fca4999eb..ad3598805 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1853,7 +1853,7 @@ class YoutubeDL(object): def compatible_formats(formats): video, audio = formats # Check extension - video_ext, audio_ext = audio.get('ext'), video.get('ext') + video_ext, audio_ext = video.get('ext'), audio.get('ext') if video_ext and audio_ext: COMPATIBLE_EXTS = ( ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'), From 1792bc3a06dbdb788d12a1e6a4a8d7072be70edb Mon Sep 17 00:00:00 2001 From: Parmjit Virk <pvirk@mts.net> Date: Thu, 19 Apr 2018 10:25:51 -0500 Subject: [PATCH 068/488] [keezmovies] Add support for generic embeds (closes #16134) --- youtube_dl/extractor/keezmovies.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index e83115e2a..d4e6f7ac1 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -20,23 +20,23 @@ from ..utils import ( class KeezMoviesIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', - 'md5': '1c1e75d22ffa53320f45eeb07bc4cdc0', + 'url': 'https://www.keezmovies.com/video/arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money-18070681', + 'md5': '2ac69cdb882055f71d82db4311732a1a', 'info_dict': { - 'id': '1214711', - 'display_id': 'petite-asian-lady-mai-playing-in-bathtub', + 'id': '18070681', + 'display_id': 'arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money', 'ext': 'mp4', - 'title': 'Petite Asian Lady Mai Playing In Bathtub', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Arab wife want it so bad I see she thirsty and has tiny money.', + 'thumbnail': None, 'view_count': int, 'age_limit': 18, } }, { - 'url': 'http://www.keezmovies.com/video/1214711', + 'url': 'http://www.keezmovies.com/video/18070681', 'only_matching': True, }] - def _extract_info(self, url): + def _extract_info(self, url, fatal=True): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = (mobj.group('display_id') @@ -55,7 +55,7 @@ class KeezMoviesIE(InfoExtractor): encrypted = False def extract_format(format_url, height=None): - if not isinstance(format_url, compat_str) or not format_url.startswith('http'): + if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//')): return if format_url in format_urls: return @@ -105,7 +105,11 @@ class KeezMoviesIE(InfoExtractor): raise ExtractorError( 'Video %s is no longer available' % video_id, expected=True) - self._sort_formats(formats) + try: + self._sort_formats(formats) + except ExtractorError: + if fatal: + raise if not title: title = self._html_search_regex( @@ -122,7 +126,9 @@ class KeezMoviesIE(InfoExtractor): } def _real_extract(self, url): - webpage, info = self._extract_info(url) + webpage, info = self._extract_info(url, fatal=False) + if not info['formats']: + return self.url_result(url, 'Generic') info['view_count'] = str_to_int(self._search_regex( r'<b>([\d,.]+)</b> Views?', webpage, 'view count', fatal=False)) return info From d317973284f6d9886bda0bf8215ffb4f060af41d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Apr 2018 22:36:33 +0700 Subject: [PATCH 069/488] [extremetube] Fix metadata extraction --- youtube_dl/extractor/extremetube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 445f9438d..acd4090fa 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -8,12 +8,12 @@ class ExtremeTubeIE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', - 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0', + 'md5': '92feaafa4b58e82f261e5419f39c60cb', 'info_dict': { 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', - 'uploader': 'unknown', + 'uploader': 'anonim', 'view_count': int, 'age_limit': 18, } @@ -36,10 +36,10 @@ class ExtremeTubeIE(KeezMoviesIE): r'<h1[^>]+title="([^"]+)"[^>]*>', webpage, 'title') uploader = self._html_search_regex( - r'Uploaded by:\s*</strong>\s*(.+?)\s*</div>', + r'Uploaded by:\s*</[^>]+>\s*<a[^>]+>(.+?)</a>', webpage, 'uploader', fatal=False) view_count = str_to_int(self._search_regex( - r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>', + r'Views:\s*</[^>]+>\s*<[^>]+>([\d,\.]+)</', webpage, 'view count', fatal=False)) info.update({ From c19420027768c80a6a375f5a517fde9f65c1fc31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Apr 2018 22:38:31 +0700 Subject: [PATCH 070/488] [mofosex] Fix test --- youtube_dl/extractor/mofosex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index 54716f5c7..1c652813a 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -12,7 +12,7 @@ class MofosexIE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P<id>\d+)/(?P<display_id>[^/?#&.]+)\.html' _TESTS = [{ 'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html', - 'md5': '39a15853632b7b2e5679f92f69b78e91', + 'md5': '558fcdafbb63a87c019218d6e49daf8a', 'info_dict': { 'id': '318131', 'display_id': 'amateur-teen-playing-and-masturbating-318131', From d65a48a0efd2184f7b2fdc823433f568bae56d86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 20 Apr 2018 23:12:13 +0700 Subject: [PATCH 071/488] [nick] Add support for nickjr.nl (closes #16230) --- youtube_dl/extractor/nick.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 090f1acee..256a24d86 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -81,13 +81,23 @@ class NickIE(MTVServicesInfoExtractor): class NickBrIE(MTVServicesInfoExtractor): IE_NAME = 'nickelodeon:br' - _VALID_URL = r'https?://(?P<domain>(?:www\.)?nickjr|mundonick\.uol)\.com\.br/(?:programas/)?[^/]+/videos/(?:episodios/)?(?P<id>[^/?#.]+)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?P<domain>(?:www\.)?nickjr|mundonick\.uol)\.com\.br| + (?:www\.)?nickjr\.nl + ) + /(?:programas/)?[^/]+/videos/(?:episodios/)?(?P<id>[^/?\#.]+) + ''' _TESTS = [{ 'url': 'http://www.nickjr.com.br/patrulha-canina/videos/210-labirinto-de-pipoca/', 'only_matching': True, }, { 'url': 'http://mundonick.uol.com.br/programas/the-loud-house/videos/muitas-irmas/7ljo9j', 'only_matching': True, + }, { + 'url': 'http://www.nickjr.nl/paw-patrol/videos/311-ge-wol-dig-om-terug-te-zijn/', + 'only_matching': True, }] def _real_extract(self, url): From 4b8588fe0215fb5ea75d4f37402ec51014cb8c53 Mon Sep 17 00:00:00 2001 From: einstein95 <einstein95@users.noreply.github.com> Date: Fri, 12 Jan 2018 07:01:02 +1300 Subject: [PATCH 072/488] [rentv] Fix extraction --- youtube_dl/extractor/rentv.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py index d338b3a93..df528b09e 100644 --- a/youtube_dl/extractor/rentv.py +++ b/youtube_dl/extractor/rentv.py @@ -26,9 +26,20 @@ class RENTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage('http://ren.tv/player/' + video_id, video_id) - jw_config = self._parse_json(self._search_regex( - r'config\s*=\s*({.+});', webpage, 'jw config'), video_id) - return self._parse_jwplayer_data(jw_config, video_id, m3u8_id='hls') + config = self._parse_json(self._search_regex( + r'config\s*=\s*({.+});', webpage, 'config'), video_id) + formats = [] + for video in config.get('src', ''): + formats.append({ + 'url': video.get('src', '') + }) + self._sort_formats(formats) + return { + 'id': video_id, + 'formats': formats, + 'title': config.get('title', ''), + 'thumbnail': config.get('image', '') + } class RENTVArticleIE(InfoExtractor): From a693386df1957ba03cbf5156a65dd18b2c37ac42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Apr 2018 23:22:10 +0700 Subject: [PATCH 073/488] [rentv] Improve extraction (closes #15227) --- youtube_dl/extractor/rentv.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py index df528b09e..8bcf87126 100644 --- a/youtube_dl/extractor/rentv.py +++ b/youtube_dl/extractor/rentv.py @@ -3,6 +3,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, +) class RENTVIE(InfoExtractor): @@ -13,7 +17,9 @@ class RENTVIE(InfoExtractor): 'info_dict': { 'id': '118577', 'ext': 'mp4', - 'title': 'Документальный спецпроект: "Промывка мозгов. Технологии XXI века"' + 'title': 'Документальный спецпроект: "Промывка мозгов. Технологии XXI века"', + 'timestamp': 1472230800, + 'upload_date': '20160826', } }, { 'url': 'http://ren.tv/player/118577', @@ -27,18 +33,31 @@ class RENTVIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage('http://ren.tv/player/' + video_id, video_id) config = self._parse_json(self._search_regex( - r'config\s*=\s*({.+});', webpage, 'config'), video_id) + r'config\s*=\s*({.+})\s*;', webpage, 'config'), video_id) + title = config['title'] formats = [] - for video in config.get('src', ''): - formats.append({ - 'url': video.get('src', '') - }) + for video in config['src']: + src = video.get('src') + if not src or not isinstance(src, compat_str): + continue + ext = determine_ext(src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + }) self._sort_formats(formats) return { 'id': video_id, + 'title': title, + 'description': config.get('description'), + 'thumbnail': config.get('image'), + 'duration': int_or_none(config.get('duration')), + 'timestamp': int_or_none(config.get('date')), 'formats': formats, - 'title': config.get('title', ''), - 'thumbnail': config.get('image', '') } From 040c6296bb9da495d37ba134baee996a3a97b64f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Apr 2018 04:55:35 +0700 Subject: [PATCH 074/488] [ccma] Fix video extraction (closes #15931) --- youtube_dl/extractor/ccma.py | 50 +++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index bec0a825a..07f5206c1 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -4,11 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + clean_html, int_or_none, parse_duration, parse_iso8601, - clean_html, + parse_resolution, ) @@ -40,34 +42,42 @@ class CCMAIE(InfoExtractor): def _real_extract(self, url): media_type, media_id = re.match(self._VALID_URL, url).groups() - media_data = {} - formats = [] - profiles = ['pc'] if media_type == 'audio' else ['mobil', 'pc'] - for i, profile in enumerate(profiles): - md = self._download_json('http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ + + media = self._download_json( + 'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ 'media': media_type, 'idint': media_id, - 'profile': profile, - }, fatal=False) - if md: - media_data = md - media_url = media_data.get('media', {}).get('url') - if media_url: - formats.append({ - 'format_id': profile, - 'url': media_url, - 'quality': i, - }) + }) + + formats = [] + media_url = media['media']['url'] + if isinstance(media_url, list): + for format_ in media_url: + format_url = format_.get('file') + if not format_url or not isinstance(format_url, compat_str): + continue + label = format_.get('label') + f = parse_resolution(label) + f.update({ + 'url': format_url, + 'format_id': label, + }) + formats.append(f) + else: + formats.append({ + 'url': media_url, + 'vcodec': 'none' if media_type == 'audio' else None, + }) self._sort_formats(formats) - informacio = media_data['informacio'] + informacio = media['informacio'] title = informacio['titol'] durada = informacio.get('durada', {}) duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc')) subtitles = {} - subtitols = media_data.get('subtitols', {}) + subtitols = media.get('subtitols', {}) if subtitols: sub_url = subtitols.get('url') if sub_url: @@ -77,7 +87,7 @@ class CCMAIE(InfoExtractor): }) thumbnails = [] - imatges = media_data.get('imatges', {}) + imatges = media.get('imatges', {}) if imatges: thumbnail_url = imatges.get('url') if thumbnail_url: From 353f0bde78eb4dd9432c092f244bb30b2abd7f70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Apr 2018 04:57:22 +0700 Subject: [PATCH 075/488] [cbssports] PEP 8 --- youtube_dl/extractor/cbssports.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 27a243d08..83b764762 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -32,5 +32,7 @@ class CBSSportsIE(CBSBaseIE): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id= self._search_regex([r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], webpage, 'video id') + video_id = self._search_regex( + [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], + webpage, 'video id') return self._extract_video_info('byId=%s' % video_id, video_id) From 488ff2dd3a193544a9912776d1c1b9d9fffc8fe7 Mon Sep 17 00:00:00 2001 From: 0x9fff00 <0x9fff00+git@protonmail.ch> Date: Sat, 17 Mar 2018 16:14:20 +0100 Subject: [PATCH 076/488] [svt] Add support for TV channel live streams (Closes #15279) --- youtube_dl/extractor/svt.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index b544da414..d01f85422 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -22,6 +22,8 @@ class SVTBaseIE(InfoExtractor): _GEO_COUNTRIES = ['SE'] def _extract_video(self, video_info, video_id): + is_live = dict_get(video_info, ('live', 'simulcast'), default=False) + m3u8_protocol = 'm3u8' if is_live else 'm3u8_native' formats = [] for vr in video_info['videoReferences']: player_type = vr.get('playerType') or vr.get('format') @@ -30,7 +32,7 @@ class SVTBaseIE(InfoExtractor): if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( vurl, video_id, - ext='mp4', entry_protocol='m3u8_native', + ext='mp4', entry_protocol=m3u8_protocol, m3u8_id=player_type, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( @@ -90,6 +92,7 @@ class SVTBaseIE(InfoExtractor): 'season_number': season_number, 'episode': episode, 'episode_number': episode_number, + 'is_live': is_live, } @@ -134,7 +137,7 @@ class SVTPlayBaseIE(SVTBaseIE): class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>\w+)' _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -158,6 +161,9 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg', 'only_matching': True, + }, { + 'url': 'https://www.svtplay.se/kanaler/svt1', + 'only_matching': True, }] def _real_extract(self, url): @@ -183,6 +189,8 @@ class SVTPlayIE(SVTPlayBaseIE): 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], 'thumbnail': thumbnail, }) + if info_dict['is_live']: + info_dict['title'] = self._live_title(info_dict['title']) return info_dict video_id = self._search_regex( @@ -198,6 +206,8 @@ class SVTPlayIE(SVTPlayBaseIE): info_dict['title'] = re.sub( r'\s*\|\s*.+?$', '', info_dict.get('episode') or self._og_search_title(webpage)) + if info_dict['is_live']: + info_dict['title'] = self._live_title(info_dict['title']) return info_dict From 6cdaaf703149f1d6f1d24cfdb5a538ca41d08a26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Apr 2018 05:33:08 +0700 Subject: [PATCH 077/488] [svt] Improve (closes #15809) --- youtube_dl/extractor/svt.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index d01f85422..f71eab8b2 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -137,7 +137,7 @@ class SVTPlayBaseIE(SVTBaseIE): class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -179,6 +179,10 @@ class SVTPlayIE(SVTPlayBaseIE): thumbnail = self._og_search_thumbnail(webpage) + def adjust_title(info): + if info['is_live']: + info['title'] = self._live_title(info['title']) + if data: video_info = try_get( data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], @@ -189,8 +193,7 @@ class SVTPlayIE(SVTPlayBaseIE): 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], 'thumbnail': thumbnail, }) - if info_dict['is_live']: - info_dict['title'] = self._live_title(info_dict['title']) + adjust_title(info_dict) return info_dict video_id = self._search_regex( @@ -206,8 +209,7 @@ class SVTPlayIE(SVTPlayBaseIE): info_dict['title'] = re.sub( r'\s*\|\s*.+?$', '', info_dict.get('episode') or self._og_search_title(webpage)) - if info_dict['is_live']: - info_dict['title'] = self._live_title(info_dict['title']) + adjust_title(info_dict) return info_dict From 3853309fe238bb709b7c5db261724c33b48a8693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Apr 2018 06:07:32 +0700 Subject: [PATCH 078/488] [youtube:feed] Implement lazy playlist extraction (closes #10184) --- youtube_dl/extractor/youtube.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 617be8e96..e9965509c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2699,10 +2699,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _real_extract(self, url): - page = self._download_webpage( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) - + def _entries(self, page): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index ids = [] @@ -2713,12 +2710,15 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): # 'recommended' feed has infinite 'load more' and each new portion spins # the same videos in (sometimes) slightly different order, so we'll check # for unicity and break when portion has no new videos - new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches)) + new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) if not new_ids: break ids.extend(new_ids) + for entry in self._ids_to_results(new_ids): + yield entry + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) if not mobj: break @@ -2730,8 +2730,12 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): content_html = more['content_html'] more_widget_html = more['load_more_widget_html'] + def _real_extract(self, url): + page = self._download_webpage( + 'https://www.youtube.com/feed/%s' % self._FEED_NAME, + self._PLAYLIST_TITLE) return self.playlist_result( - self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE) + self._entries(page), playlist_title=self._PLAYLIST_TITLE) class YoutubeWatchLaterIE(YoutubePlaylistIE): From 70d35d166c1cfb14af20fb6d45ed820b6249f941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Apr 2018 06:08:05 +0700 Subject: [PATCH 079/488] [youtube] Add ability to authenticate with cookies --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e9965509c..e7bd1f18f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -87,7 +87,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): (username, password) = self._get_login_info() # No authentication to be performed if username is None: - if self._LOGIN_REQUIRED: + if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) return True From 2441c1aab152cd81d53f0a6fca982af9f8c8de10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Apr 2018 00:16:52 +0700 Subject: [PATCH 080/488] [breakcom] Fix extraction (closes #16254) --- youtube_dl/extractor/breakcom.py | 148 ++++++++++--------------------- 1 file changed, 47 insertions(+), 101 deletions(-) diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 5a87c2661..70d16767f 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -3,15 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_age_limit, -) +from ..utils import int_or_none class BreakIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<site>break|screenjunkies)\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' + _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', 'info_dict': { @@ -19,125 +17,73 @@ class BreakIE(InfoExtractor): 'ext': 'mp4', 'title': 'When Girls Act Like D-Bags', 'age_limit': 13, + }, + }, { + # youtube embed + 'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work', + 'info_dict': { + 'id': 'RrrDLdeL2HQ', + 'ext': 'mp4', + 'title': 'Whale Watching Boat Crashing Into San Diego Dock', + 'description': 'md5:afc1b2772f0a8468be51dd80eb021069', + 'upload_date': '20160331', + 'uploader': 'Steve Holden', + 'uploader_id': 'sdholden07', + }, + 'params': { + 'skip_download': True, } - }, { - 'url': 'http://www.screenjunkies.com/video/best-quentin-tarantino-movie-2841915', - 'md5': '5c2b686bec3d43de42bde9ec047536b0', - 'info_dict': { - 'id': '2841915', - 'display_id': 'best-quentin-tarantino-movie', - 'ext': 'mp4', - 'title': 'Best Quentin Tarantino Movie', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3671, - 'age_limit': 13, - 'tags': list, - }, - }, { - 'url': 'http://www.screenjunkies.com/video/honest-trailers-the-dark-knight', - 'info_dict': { - 'id': '2348808', - 'display_id': 'honest-trailers-the-dark-knight', - 'ext': 'mp4', - 'title': 'Honest Trailers - The Dark Knight', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)', - 'age_limit': 10, - 'tags': list, - }, - }, { - # requires subscription but worked around - 'url': 'http://www.screenjunkies.com/video/knocking-dead-ep-1-the-show-so-far-3003285', - 'info_dict': { - 'id': '3003285', - 'display_id': 'knocking-dead-ep-1-the-show-so-far', - 'ext': 'mp4', - 'title': 'State of The Dead Recap: Knocking Dead Pilot', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3307, - 'age_limit': 13, - 'tags': list, - }, }, { 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', 'only_matching': True, }] - _DEFAULT_BITRATES = (48, 150, 320, 496, 864, 2240, 3264) - def _real_extract(self, url): - site, display_id, video_id = re.match(self._VALID_URL, url).groups() + display_id, video_id = re.match(self._VALID_URL, url).groups() - if not video_id: - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - (r'src=["\']/embed/(\d+)', r'data-video-content-id=["\'](\d+)'), - webpage, 'video id') + webpage = self._download_webpage(url, display_id) - webpage = self._download_webpage( - 'http://www.%s.com/embed/%s' % (site, video_id), - display_id, 'Downloading video embed page') - embed_vars = self._parse_json( + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) + + content = self._parse_json( self._search_regex( - r'(?s)embedVars\s*=\s*({.+?})\s*</script>', webpage, 'embed vars'), + r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', webpage, + 'content'), display_id) - youtube_id = embed_vars.get('youtubeId') - if youtube_id: - return self.url_result(youtube_id, 'Youtube') - - title = embed_vars['contentName'] - formats = [] - bitrates = [] - for f in embed_vars.get('media', []): - if not f.get('uri') or f.get('mediaPurpose') != 'play': + for video in content: + video_url = video.get('url') + if not video_url or not isinstance(video_url, compat_str): continue - bitrate = int_or_none(f.get('bitRate')) - if bitrate: - bitrates.append(bitrate) + bitrate = int_or_none(self._search_regex( + r'(\d+)_kbps', video_url, 'tbr', default=None)) formats.append({ - 'url': f['uri'], + 'url': video_url, 'format_id': 'http-%d' % bitrate if bitrate else 'http', - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), 'tbr': bitrate, - 'format': 'mp4', }) - - if not bitrates: - # When subscriptionLevel > 0, i.e. plus subscription is required - # media list will be empty. However, hds and hls uris are still - # available. We can grab them assuming bitrates to be default. - bitrates = self._DEFAULT_BITRATES - - auth_token = embed_vars.get('AuthToken') - - def construct_manifest_url(base_url, ext): - pieces = [base_url] - pieces.extend([compat_str(b) for b in bitrates]) - pieces.append('_kbps.mp4.%s?%s' % (ext, auth_token)) - return ','.join(pieces) - - if bitrates and auth_token: - hds_url = embed_vars.get('hdsUri') - if hds_url: - formats.extend(self._extract_f4m_formats( - construct_manifest_url(hds_url, 'f4m'), - display_id, f4m_id='hds', fatal=False)) - hls_url = embed_vars.get('hlsUri') - if hls_url: - formats.extend(self._extract_m3u8_formats( - construct_manifest_url(hls_url, 'm3u8'), - display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) + title = self._search_regex( + (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + r'<h1[^>]*>(?P<value>[^<]+)'), webpage, 'title', group='value') + + def get(key, name): + return int_or_none(self._search_regex( + r'%s["\']\s*:\s*["\'](\d+)' % key, webpage, name, + default=None)) + + age_limit = get('ratings', 'age limit') + video_id = video_id or get('pid', 'video id') or display_id + return { 'id': video_id, 'display_id': display_id, 'title': title, - 'thumbnail': embed_vars.get('thumbUri'), - 'duration': int_or_none(embed_vars.get('videoLengthInSeconds')) or None, - 'age_limit': parse_age_limit(embed_vars.get('audienceRating')), - 'tags': embed_vars.get('tags', '').split(','), + 'thumbnail': self._og_search_thumbnail(webpage), + 'age_limit': age_limit, 'formats': formats, } From af751350e8651f665333554fa13b335b073fa736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Apr 2018 02:50:11 +0700 Subject: [PATCH 081/488] [Makefile] Add support for pandoc 2 and disable smart extension (closes #16251) smart extension rewrites straight quotes as curly quotes, -- as en-dashes and so on that is unwanted behavior. --- Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index fe247810f..4a62f44bc 100644 --- a/Makefile +++ b/Makefile @@ -14,6 +14,9 @@ PYTHON ?= /usr/bin/env python # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local SYSCONFDIR = $(shell if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi) +# set markdown input format to "markdown-smart" for pandoc version 2 and to "markdown" for pandoc prior to version 2 +MARKDOWN = $(shell if [ `pandoc -v | head -n1 | cut -d" " -f2 | head -c1` = "2" ]; then echo markdown-smart; else echo markdown; fi) + install: youtube-dl youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish install -d $(DESTDIR)$(BINDIR) install -m 755 youtube-dl $(DESTDIR)$(BINDIR) @@ -82,11 +85,11 @@ supportedsites: $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md README.txt: README.md - pandoc -f markdown -t plain README.md -o README.txt + pandoc -f $(MARKDOWN) -t plain README.md -o README.txt youtube-dl.1: README.md $(PYTHON) devscripts/prepare_manpage.py youtube-dl.1.temp.md - pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1 + pandoc -s -f $(MARKDOWN) -t man youtube-dl.1.temp.md -o youtube-dl.1 rm -f youtube-dl.1.temp.md youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in From 171625469ab1b2a4dc99ed173a10be45e7fc13d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Apr 2018 03:17:34 +0700 Subject: [PATCH 082/488] [etonline] Remove extractor (closes #16256) Covered by generic extractor --- youtube_dl/extractor/etonline.py | 39 ------------------------------ youtube_dl/extractor/extractors.py | 1 - 2 files changed, 40 deletions(-) delete mode 100644 youtube_dl/extractor/etonline.py diff --git a/youtube_dl/extractor/etonline.py b/youtube_dl/extractor/etonline.py deleted file mode 100644 index 17d7cfec6..000000000 --- a/youtube_dl/extractor/etonline.py +++ /dev/null @@ -1,39 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class ETOnlineIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?etonline\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.etonline.com/tv/211130_dove_cameron_liv_and_maddie_emotional_episode_series_finale/', - 'info_dict': { - 'id': '211130_dove_cameron_liv_and_maddie_emotional_episode_series_finale', - 'title': 'md5:a21ec7d3872ed98335cbd2a046f34ee6', - 'description': 'md5:8b94484063f463cca709617c79618ccd', - }, - 'playlist_count': 2, - }, { - 'url': 'http://www.etonline.com/media/video/here_are_the_stars_who_love_bringing_their_moms_as_dates_to_the_oscars-211359/', - 'only_matching': True, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911076001/default_default/index.html?videoId=ref:%s' - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - entries = [ - self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % video_id, 'BrightcoveNew', video_id) - for video_id in re.findall( - r'site\.brightcove\s*\([^,]+,\s*["\'](title_\d+)', webpage)] - - return self.playlist_result( - entries, playlist_id, - self._og_search_title(webpage, fatal=False), - self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3570fa165..6fb65e4fe 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -326,7 +326,6 @@ from .espn import ( FiveThirtyEightIE, ) from .esri import EsriVideoIE -from .etonline import ETOnlineIE from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE From 99036a1298089068dcf80c0985bfcc3f8c24f281 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Apr 2018 04:03:11 +0700 Subject: [PATCH 083/488] [pornflip] Relax _VALID_URL (closes #16258) --- youtube_dl/extractor/pornflip.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornflip.py b/youtube_dl/extractor/pornflip.py index ee04936e1..025985fbc 100644 --- a/youtube_dl/extractor/pornflip.py +++ b/youtube_dl/extractor/pornflip.py @@ -14,7 +14,7 @@ from ..utils import ( class PornFlipIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:v|embed)/(?P<id>[0-9A-Za-z-]{11})' + _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:v|embed)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pornflip.com/v/wz7DfNhMmep', 'md5': '98c46639849145ae1fd77af532a9278c', @@ -40,6 +40,9 @@ class PornFlipIE(InfoExtractor): }, { 'url': 'https://www.pornflip.com/embed/EkRD6-vS2-s', 'only_matching': True, + }, { + 'url': 'https://www.pornflip.com/v/NG9q6Pb_iK8', + 'only_matching': True, }] def _real_extract(self, url): From 1cc47c667419e0eadc0a6989256ab7b276852adf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 24 Apr 2018 23:49:30 +0700 Subject: [PATCH 084/488] [utils] Fix match_str for boolean meta fields --- test/test_utils.py | 12 ++++++++++++ youtube_dl/utils.py | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index a1fe6fdb2..253a7fe17 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1072,6 +1072,18 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') self.assertFalse(match_str( 'like_count > 100 & dislike_count <? 50 & description', {'like_count': 190, 'dislike_count': 10})) + self.assertTrue(match_str('is_live', {'is_live': True})) + self.assertFalse(match_str('is_live', {'is_live': False})) + self.assertFalse(match_str('is_live', {'is_live': None})) + self.assertFalse(match_str('is_live', {})) + self.assertFalse(match_str('!is_live', {'is_live': True})) + self.assertTrue(match_str('!is_live', {'is_live': False})) + self.assertTrue(match_str('!is_live', {'is_live': None})) + self.assertTrue(match_str('!is_live', {})) + self.assertTrue(match_str('title', {'title': 'abc'})) + self.assertTrue(match_str('title', {'title': ''})) + self.assertFalse(match_str('!title', {'title': 'abc'})) + self.assertFalse(match_str('!title', {'title': ''})) def test_parse_dfxp_time_expr(self): self.assertEqual(parse_dfxp_time_expr(None), None) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 027d12785..574284e94 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2574,8 +2574,8 @@ def _match_one(filter_part, dct): return op(actual_value, comparison_value) UNARY_OPERATORS = { - '': lambda v: v is not None, - '!': lambda v: v is None, + '': lambda v: (v is True) if isinstance(v, bool) else (v is not None), + '!': lambda v: (v is False) if isinstance(v, bool) else (v is None), } operator_rex = re.compile(r'''(?x)\s* (?P<op>%s)\s*(?P<key>[a-z_]+) From 0ff51adae6feab7386874eddc0d61dbeaf063bf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 24 Apr 2018 23:53:01 +0700 Subject: [PATCH 085/488] [twitch] Extract is_live according to status (closes #16259) --- youtube_dl/extractor/twitch.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index f736283e9..4c11fd3c3 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -168,6 +168,13 @@ class TwitchItemBaseIE(TwitchBaseIE): return self.playlist_result(entries, info['id'], info['title']) def _extract_info(self, info): + status = info.get('status') + if status == 'recording': + is_live = True + elif status == 'recorded': + is_live = False + else: + is_live = None return { 'id': info['_id'], 'title': info.get('title') or 'Untitled Broadcast', @@ -178,6 +185,7 @@ class TwitchItemBaseIE(TwitchBaseIE): 'uploader_id': info.get('channel', {}).get('name'), 'timestamp': parse_iso8601(info.get('recorded_at')), 'view_count': int_or_none(info.get('views')), + 'is_live': is_live, } def _real_extract(self, url): From 76030543cd5e2214c47aa82f03b3e2cec97e7bc1 Mon Sep 17 00:00:00 2001 From: Alexandre Macabies <Zopieux@users.noreply.github.com> Date: Tue, 24 Apr 2018 19:49:30 +0200 Subject: [PATCH 086/488] [openload] Recognize IPv6 stream URLs (closes #16137) --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 650f95656..d0bdd60b8 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -340,7 +340,10 @@ class OpenloadIE(InfoExtractor): get_element_by_id('streamurj', webpage) or self._search_regex( (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<', - r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)'), webpage, + r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)', + r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<', + r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<', + r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, 'stream URL')) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id From 5d0fe6d23e4407bee3caec33955d4cb410bebb5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Apr 2018 00:56:16 +0700 Subject: [PATCH 087/488] Credit @Zopieux for #16250 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 6223212aa..880e0abee 100644 --- a/AUTHORS +++ b/AUTHORS @@ -236,3 +236,4 @@ Lei Wang Petr Novák Leonardo Taccari Martin Weinelt +Alexandre Macabies From 95284bc281d8aa3b1d6863ccb536da9d4cf6433c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Apr 2018 01:01:06 +0700 Subject: [PATCH 088/488] Credit @TingPing for picarto (#15551) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 880e0abee..812051796 100644 --- a/AUTHORS +++ b/AUTHORS @@ -236,4 +236,5 @@ Lei Wang Petr Novák Leonardo Taccari Martin Weinelt +TingPing Alexandre Macabies From ecb24f7c081b764dd669cb4b277d8c14e55b2a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Apr 2018 01:02:28 +0700 Subject: [PATCH 089/488] Credit @f2face for #16115 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 812051796..eaf96d79d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -236,5 +236,6 @@ Lei Wang Petr Novák Leonardo Taccari Martin Weinelt +Surya Oktafendri TingPing Alexandre Macabies From e028d4f506562a1febf76277795305e296823ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Apr 2018 01:03:42 +0700 Subject: [PATCH 090/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/ChangeLog b/ChangeLog index 185fa1753..a731fde29 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +version <unreleased> + +Core +* [utils] Fix match_str for boolean meta fields ++ [Makefile] Add support for pandoc 2 and disable smart extension (#16251) +* [YoutubeDL] Fix typo in media extension compatibility checker (#16215) + +Extractors ++ [openload] Recognize IPv6 stream URLs (#16136, #16137, #16205, #16246, + #16250) ++ [twitch] Extract is_live according to status (#16259) +* [pornflip] Relax URL regular expression (#16258) +- [etonline] Remove extractor (#16256) +* [breakcom] Fix extraction (#16254) ++ [youtube] Add ability to authenticate with cookies +* [youtube:feed] Implement lazy playlist extraction (#10184) ++ [svt] Add support for TV channel live streams (#15279, #15809) +* [ccma] Fix video extraction (#15931) +* [rentv] Fix extraction (#15227) ++ [nick] Add support for nickjr.nl (#16230) +* [extremetube] Fix metadata extraction ++ [keezmovies] Add support for generic embeds (#16134, #16154) +* [nexx] Extract new azure URLs (#16223) +* [cbssports] Fix extraction (#16217) +* [kaltura] Improve embeds detection (#16201) +* [instagram:user] Fix extraction (#16119) +* [cbs] Skip DRM asset types (#16104) + + version 2018.04.16 Extractors From b5802d69f511481a87d8604fa1577bca8370cab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Apr 2018 01:12:40 +0700 Subject: [PATCH 091/488] release 2018.04.25 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 - youtube_dl/version.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 69f996179..252fa0adf 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.16*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.16** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.25*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.25** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.04.16 +[debug] youtube-dl version 2018.04.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index a731fde29..4a3df67df 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.04.25 Core * [utils] Fix match_str for boolean meta fields diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 715d16cfe..a110f687b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -257,7 +257,6 @@ - **ESPN** - **ESPNArticle** - **EsriVideo** - - **ETOnline** - **Europa** - **EveryonesMixtape** - **ExpoTV** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5aefdd0a2..4e3cb39c6 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.04.16' +__version__ = '2018.04.25' From d3711b00502d9104a3697aba5d210a25066ca756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Apr 2018 02:14:27 +0700 Subject: [PATCH 092/488] [devscripts/gh-pages/generate-download.py] Use program checksum from versions.json --- devscripts/gh-pages/generate-download.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/devscripts/gh-pages/generate-download.py b/devscripts/gh-pages/generate-download.py index fcd7e1dff..a873d32ee 100755 --- a/devscripts/gh-pages/generate-download.py +++ b/devscripts/gh-pages/generate-download.py @@ -1,27 +1,22 @@ #!/usr/bin/env python3 from __future__ import unicode_literals -import hashlib -import urllib.request import json versions_info = json.load(open('update/versions.json')) version = versions_info['latest'] -URL = versions_info['versions'][version]['bin'][0] - -data = urllib.request.urlopen(URL).read() +version_dict = versions_info['versions'][version] # Read template page with open('download.html.in', 'r', encoding='utf-8') as tmplf: template = tmplf.read() -sha256sum = hashlib.sha256(data).hexdigest() template = template.replace('@PROGRAM_VERSION@', version) -template = template.replace('@PROGRAM_URL@', URL) -template = template.replace('@PROGRAM_SHA256SUM@', sha256sum) -template = template.replace('@EXE_URL@', versions_info['versions'][version]['exe'][0]) -template = template.replace('@EXE_SHA256SUM@', versions_info['versions'][version]['exe'][1]) -template = template.replace('@TAR_URL@', versions_info['versions'][version]['tar'][0]) -template = template.replace('@TAR_SHA256SUM@', versions_info['versions'][version]['tar'][1]) +template = template.replace('@PROGRAM_URL@', version_dict['bin'][0]) +template = template.replace('@PROGRAM_SHA256SUM@', version_dict['bin'][1]) +template = template.replace('@EXE_URL@', version_dict['exe'][0]) +template = template.replace('@EXE_SHA256SUM@', version_dict['exe'][1]) +template = template.replace('@TAR_URL@', version_dict['tar'][0]) +template = template.replace('@TAR_SHA256SUM@', version_dict['tar'][1]) with open('download.html', 'w', encoding='utf-8') as dlf: dlf.write(template) From c84eae4f66be8a22c14b852bdb01773bb3807239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 27 Apr 2018 03:45:52 +0700 Subject: [PATCH 093/488] [funk:channel] Improve extraction (closes #16285) --- youtube_dl/extractor/funk.py | 51 ++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/funk.py b/youtube_dl/extractor/funk.py index faea6576f..0ff058619 100644 --- a/youtube_dl/extractor/funk.py +++ b/youtube_dl/extractor/funk.py @@ -5,7 +5,10 @@ import re from .common import InfoExtractor from .nexx import NexxIE -from ..utils import int_or_none +from ..utils import ( + int_or_none, + try_get, +) class FunkBaseIE(InfoExtractor): @@ -77,6 +80,20 @@ class FunkChannelIE(FunkBaseIE): 'params': { 'skip_download': True, }, + }, { + # only available via byIdList API + 'url': 'https://www.funk.net/channel/informr/martin-sonneborn-erklaert-die-eu', + 'info_dict': { + 'id': '205067', + 'ext': 'mp4', + 'title': 'Martin Sonneborn erklärt die EU', + 'description': 'md5:050f74626e4ed87edf4626d2024210c0', + 'timestamp': 1494424042, + 'upload_date': '20170510', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.funk.net/channel/59d5149841dca100012511e3/mein-erster-job-lovemilla-folge-1/lovemilla/', 'only_matching': True, @@ -87,16 +104,28 @@ class FunkChannelIE(FunkBaseIE): channel_id = mobj.group('id') alias = mobj.group('alias') - results = self._download_json( - 'https://www.funk.net/api/v3.0/content/videos/filter', channel_id, - headers={ - 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc', - 'Referer': url, - }, query={ - 'channelId': channel_id, - 'size': 100, - })['result'] + headers = { + 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc', + 'Referer': url, + } - video = next(r for r in results if r.get('alias') == alias) + video = None + + by_id_list = self._download_json( + 'https://www.funk.net/api/v3.0/content/videos/byIdList', channel_id, + headers=headers, query={ + 'ids': alias, + }, fatal=False) + if by_id_list: + video = try_get(by_id_list, lambda x: x['result'][0], dict) + + if not video: + results = self._download_json( + 'https://www.funk.net/api/v3.0/content/videos/filter', channel_id, + headers=headers, query={ + 'channelId': channel_id, + 'size': 100, + })['result'] + video = next(r for r in results if r.get('alias') == alias) return self._make_url_result(video) From 0fe7783eced5c62dbd95780c2150fd1080bd3927 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Apr 2018 01:59:15 +0700 Subject: [PATCH 094/488] [extractor/common] Add _download_json_handle --- youtube_dl/extractor/common.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 59b9d3739..e0c3c8eb0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -682,18 +682,30 @@ class InfoExtractor(object): else: self.report_warning(errmsg + str(ve)) - def _download_json(self, url_or_request, video_id, - note='Downloading JSON metadata', - errnote='Unable to download JSON metadata', - transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - json_string = self._download_webpage( + def _download_json_handle( + self, url_or_request, video_id, note='Downloading JSON metadata', + errnote='Unable to download JSON metadata', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}): + """Return a tuple (JSON object, URL handle)""" + res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) - if (not fatal) and json_string is False: - return None + if res is False: + return res + json_string, urlh = res return self._parse_json( - json_string, video_id, transform_source=transform_source, fatal=fatal) + json_string, video_id, transform_source=transform_source, + fatal=fatal), urlh + + def _download_json( + self, url_or_request, video_id, note='Downloading JSON metadata', + errnote='Unable to download JSON metadata', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}): + res = self._download_json_handle( + url_or_request, video_id, note=note, errnote=errnote, + transform_source=transform_source, fatal=fatal, encoding=encoding, + data=data, headers=headers, query=query) + return res if res is False else res[0] def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): if transform_source: From 6cc622327ff8289f94894f3695ed31014c61cf8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Apr 2018 02:47:17 +0700 Subject: [PATCH 095/488] [utils] Introduce merge_dicts --- test/test_utils.py | 12 ++++++++++++ youtube_dl/extractor/generic.py | 16 +--------------- youtube_dl/utils.py | 14 ++++++++++++++ 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 253a7fe17..14503ab53 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -42,6 +42,7 @@ from youtube_dl.utils import ( is_html, js_to_json, limit_length, + merge_dicts, mimetype2ext, month_by_name, multipart_encode, @@ -669,6 +670,17 @@ class TestUtil(unittest.TestCase): self.assertEqual(dict_get(d, ('b', 'c', key, )), None) self.assertEqual(dict_get(d, ('b', 'c', key, ), skip_false_values=False), false_value) + def test_merge_dicts(self): + self.assertEqual(merge_dicts({'a': 1}, {'b': 2}), {'a': 1, 'b': 2}) + self.assertEqual(merge_dicts({'a': 1}, {'a': 2}), {'a': 1}) + self.assertEqual(merge_dicts({'a': 1}, {'a': None}), {'a': 1}) + self.assertEqual(merge_dicts({'a': 1}, {'a': ''}), {'a': 1}) + self.assertEqual(merge_dicts({'a': 1}, {}), {'a': 1}) + self.assertEqual(merge_dicts({'a': None}, {'a': 1}), {'a': 1}) + self.assertEqual(merge_dicts({'a': ''}, {'a': 1}), {'a': ''}) + self.assertEqual(merge_dicts({'a': ''}, {'a': 'abc'}), {'a': 'abc'}) + self.assertEqual(merge_dicts({'a': None}, {'a': ''}, {'a': 'abc'}), {'a': 'abc'}) + def test_encode_compat_str(self): self.assertEqual(encode_compat_str(b'\xd1\x82\xd0\xb5\xd1\x81\xd1\x82', 'utf-8'), 'тест') self.assertEqual(encode_compat_str('тест', 'utf-8'), 'тест') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index af1322e00..d48914495 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -23,6 +23,7 @@ from ..utils import ( is_html, js_to_json, KNOWN_EXTENSIONS, + merge_dicts, mimetype2ext, orderedSet, sanitized_Request, @@ -3002,21 +3003,6 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( sharevideos_urls, video_id, video_title) - def merge_dicts(dict1, dict2): - merged = {} - for k, v in dict1.items(): - if v is not None: - merged[k] = v - for k, v in dict2.items(): - if v is None: - continue - if (k not in merged or - (isinstance(v, compat_str) and v and - isinstance(merged[k], compat_str) and - not merged[k])): - merged[k] = v - return merged - # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 574284e94..b460393bf 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2225,6 +2225,20 @@ def try_get(src, getter, expected_type=None): return v +def merge_dicts(*dicts): + merged = {} + for a_dict in dicts: + for k, v in a_dict.items(): + if v is None: + continue + if (k not in merged or + (isinstance(v, compat_str) and v and + isinstance(merged[k], compat_str) and + not merged[k])): + merged[k] = v + return merged + + def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) From e7e4a6e0f9166cee82c165ca69a6a3c94ddc5f45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Apr 2018 02:48:03 +0700 Subject: [PATCH 096/488] [extractor/common] Extract interaction statistic --- youtube_dl/extractor/common.py | 35 ++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e0c3c8eb0..a9939b0fd 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1020,6 +1020,40 @@ class InfoExtractor(object): if isinstance(json_ld, dict): json_ld = [json_ld] + INTERACTION_TYPE_MAP = { + 'CommentAction': 'comment', + 'AgreeAction': 'like', + 'DisagreeAction': 'dislike', + 'LikeAction': 'like', + 'DislikeAction': 'dislike', + 'ListenAction': 'view', + 'WatchAction': 'view', + 'ViewAction': 'view', + } + + def extract_interaction_statistic(e): + interaction_statistic = e.get('interactionStatistic') + if not isinstance(interaction_statistic, list): + return + for is_e in interaction_statistic: + if not isinstance(is_e, dict): + continue + if is_e.get('@type') != 'InteractionCounter': + continue + interaction_type = is_e.get('interactionType') + if not isinstance(interaction_type, compat_str): + continue + interaction_count = int_or_none(is_e.get('userInteractionCount')) + if interaction_count is None: + continue + count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1]) + if not count_kind: + continue + count_key = '%s_count' % count_kind + if info.get(count_key) is not None: + continue + info[count_key] = interaction_count + def extract_video_object(e): assert e['@type'] == 'VideoObject' info.update({ @@ -1035,6 +1069,7 @@ class InfoExtractor(object): 'height': int_or_none(e.get('height')), 'view_count': int_or_none(e.get('interactionCount')), }) + extract_interaction_statistic(e) for e in json_ld: if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')): From ae1c585cee3eb183cddf7c30a09b75d887307dee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Apr 2018 02:48:20 +0700 Subject: [PATCH 097/488] [vimeo] Extract JSON LD (closes #16295) --- youtube_dl/extractor/vimeo.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 08257147e..a026526b2 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -16,6 +16,7 @@ from ..utils import ( ExtractorError, InAdvancePagedList, int_or_none, + merge_dicts, NO_DEFAULT, RegexNotFoundError, sanitized_Request, @@ -639,16 +640,18 @@ class VimeoIE(VimeoBaseInfoExtractor): 'preference': 1, }) - info_dict = self._parse_config(config, video_id) - formats.extend(info_dict['formats']) + info_dict_config = self._parse_config(config, video_id) + formats.extend(info_dict_config['formats']) self._vimeo_sort_formats(formats) + json_ld = self._search_json_ld(webpage, video_id, default={}) + if not cc_license: cc_license = self._search_regex( r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') - info_dict.update({ + info_dict = { 'id': video_id, 'formats': formats, 'timestamp': unified_timestamp(timestamp), @@ -658,7 +661,9 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': like_count, 'comment_count': comment_count, 'license': cc_license, - }) + } + + info_dict = merge_dicts(info_dict, info_dict_config, json_ld) return info_dict From 7dd6ab4a47b08beafe45befa29c44df2db00547e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Apr 2018 04:51:39 +0700 Subject: [PATCH 098/488] [imdb] Extract all formats (closes #16249) --- youtube_dl/extractor/imdb.py | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 3ff672a89..425421968 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -3,7 +3,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + determine_ext, mimetype2ext, qualities, remove_end, @@ -73,19 +75,25 @@ class ImdbIE(InfoExtractor): video_info_list = format_info.get('videoInfoList') if not video_info_list or not isinstance(video_info_list, list): continue - video_info = video_info_list[0] - if not video_info or not isinstance(video_info, dict): - continue - video_url = video_info.get('videoUrl') - if not video_url: - continue - format_id = format_info.get('ffname') - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': mimetype2ext(video_info.get('videoMimeType')), - 'quality': quality(format_id), - }) + for video_info in video_info_list: + if not video_info or not isinstance(video_info, dict): + continue + video_url = video_info.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue + if (video_info.get('videoMimeType') == 'application/x-mpegURL' or + determine_ext(video_url) == 'm3u8'): + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + format_id = format_info.get('ffname') + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': mimetype2ext(video_info.get('videoMimeType')), + 'quality': quality(format_id), + }) self._sort_formats(formats) return { From 500a86a52ee46a3a1acc864b602b74d141afdc24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Apr 2018 00:33:31 +0700 Subject: [PATCH 099/488] [downloader/fragment] Restart download if .ytdl file is corrupt (closes #16312) --- youtube_dl/downloader/fragment.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 927c7e491..917f6dc01 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -74,9 +74,14 @@ class FragmentFD(FileDownloader): return not ctx['live'] and not ctx['tmpfilename'] == '-' def _read_ytdl_file(self, ctx): + assert 'ytdl_corrupt' not in ctx stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r') - ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index'] - stream.close() + try: + ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index'] + except Exception: + ctx['ytdl_corrupt'] = True + finally: + stream.close() def _write_ytdl_file(self, ctx): frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w') @@ -158,11 +163,17 @@ class FragmentFD(FileDownloader): if self.__do_ytdl_file(ctx): if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): self._read_ytdl_file(ctx) - if ctx['fragment_index'] > 0 and resume_len == 0: + is_corrupt = ctx.get('ytdl_corrupt') is True + is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 + if is_corrupt or is_inconsistent: + message = ( + '.ytdl file is corrupt' if is_corrupt else + 'Inconsistent state of incomplete fragment download') self.report_warning( - 'Inconsistent state of incomplete fragment download. ' - 'Restarting from the beginning...') + '%s. Restarting from the beginning...' % message) ctx['fragment_index'] = resume_len = 0 + if 'ytdl_corrupt' in ctx: + del ctx['ytdl_corrupt'] self._write_ytdl_file(ctx) else: self._write_ytdl_file(ctx) From 106c8c3edbc5b7e95cfba79ddc6252fad0adb859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Apr 2018 19:04:40 +0700 Subject: [PATCH 100/488] [nrktv] Update API host (closes #16324) --- youtube_dl/extractor/nrk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 18ead9426..3b4f51f61 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -237,7 +237,7 @@ class NRKTVIE(NRKBaseIE): (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P<part_id>\d+))? ''' % _EPISODE_RE - _API_HOST = 'psapi-ne.nrk.no' + _API_HOST = 'psapi-we.nrk.no' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', From 12b0d4e0e1df6d6a8b9ce10b9a69013497adc2b0 Mon Sep 17 00:00:00 2001 From: Meneth32 <meneth@hotmail.com> Date: Sun, 29 Apr 2018 16:59:40 +0200 Subject: [PATCH 101/488] [redditr] Add support for old.reddit.com URLs --- youtube_dl/extractor/reddit.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py index 53b1c967e..8372925be 100644 --- a/youtube_dl/extractor/reddit.py +++ b/youtube_dl/extractor/reddit.py @@ -47,7 +47,7 @@ class RedditIE(InfoExtractor): class RedditRIE(InfoExtractor): - _VALID_URL = r'(?P<url>https?://(?:www\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))' + _VALID_URL = r'(?P<url>https?://(?:(?:www|old)\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', 'info_dict': { @@ -74,6 +74,10 @@ class RedditRIE(InfoExtractor): # imgur 'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', 'only_matching': True, + }, { + # imgur @ old reddit + 'url': 'https://old.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, }, { # streamable 'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/', From 01aec8488084e62aa188b5167e57d01ef66cd256 Mon Sep 17 00:00:00 2001 From: Bastian de Groot <bastiandg@users.noreply.github.com> Date: Sun, 29 Apr 2018 17:14:37 +0200 Subject: [PATCH 102/488] [generic] Prefer enclosures over links in RSS feeds --- youtube_dl/extractor/generic.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d48914495..252f97c26 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -191,6 +191,16 @@ class GenericIE(InfoExtractor): 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', } }, + # RSS feed with enclosures and unsupported link URLs + { + 'url': 'http://www.hellointernet.fm/podcast?format=rss', + 'info_dict': { + 'id': 'http://www.hellointernet.fm/podcast?format=rss', + 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', + 'title': 'Hello Internet', + }, + 'playlist_mincount': 100, + }, # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng { 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', @@ -2026,13 +2036,15 @@ class GenericIE(InfoExtractor): entries = [] for it in doc.findall('./channel/item'): - next_url = xpath_text(it, 'link', fatal=False) + next_url = None + enclosure_nodes = it.findall('./enclosure') + for e in enclosure_nodes: + next_url = e.attrib.get('url') + if next_url: + break + if not next_url: - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break + next_url = xpath_text(it, 'link', fatal=False) if not next_url: continue From 30226342ab346263b684170c4ce7d5266fec212e Mon Sep 17 00:00:00 2001 From: Niklas Haas <git@haasn.xyz> Date: Sun, 29 Apr 2018 11:23:23 +0200 Subject: [PATCH 103/488] [youtube] Correctly disable polymer on all requests Rather than just the one that use the _download_webpage helper. The need for this was made apparent by 0fe7783e, which refactored _download_json in a way that completely avoids the use of _download_webpage, thus breaking youtube. Fixes #16323 --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e7bd1f18f..04aeb91af 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -246,9 +246,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True - def _download_webpage(self, *args, **kwargs): + def _download_webpage_handle(self, *args, **kwargs): kwargs.setdefault('query', {})['disable_polymer'] = 'true' - return super(YoutubeBaseInfoExtractor, self)._download_webpage( + return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) def _real_initialize(self): From e5eadfa82f10bda43294d1da85024eec29c7973f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Apr 2018 22:49:47 +0700 Subject: [PATCH 104/488] [udemy,xiami,yandexmusic] Override _download_webpage_handle instead of _download_webpage --- youtube_dl/extractor/udemy.py | 4 ++-- youtube_dl/extractor/xiami.py | 4 ++-- youtube_dl/extractor/yandexmusic.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 6d6c0a98f..439ed2a89 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -115,9 +115,9 @@ class UdemyIE(InfoExtractor): error_str += ' - %s' % error_data.get('formErrors') raise ExtractorError(error_str, expected=True) - def _download_webpage(self, *args, **kwargs): + def _download_webpage_handle(self, *args, **kwargs): kwargs.setdefault('headers', {})['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4' - return super(UdemyIE, self)._download_webpage( + return super(UdemyIE, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) def _download_json(self, url_or_request, *args, **kwargs): diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index 7f871c8ec..8333fb534 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -9,8 +9,8 @@ from ..utils import int_or_none class XiamiBaseIE(InfoExtractor): _API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id' - def _download_webpage(self, *args, **kwargs): - webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs) + def _download_webpage_handle(self, *args, **kwargs): + webpage = super(XiamiBaseIE, self)._download_webpage_handle(*args, **kwargs) if '>Xiami is currently not available in your country.<' in webpage: self.raise_geo_restricted('Xiami is currently not available in your country') return webpage diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index eb1062142..e85eca073 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -34,8 +34,8 @@ class YandexMusicBaseIE(InfoExtractor): 'youtube-dl with --cookies', expected=True) - def _download_webpage(self, *args, **kwargs): - webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) + def _download_webpage_handle(self, *args, **kwargs): + webpage = super(YandexMusicBaseIE, self)._download_webpage_handle(*args, **kwargs) if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: self._raise_captcha() return webpage From 796bf9de45d6f01bf2d34ae22e1eacdc1a649fab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Apr 2018 22:56:07 +0700 Subject: [PATCH 105/488] [yandexmusic] Convert release_year to int --- youtube_dl/extractor/yandexmusic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index e85eca073..009203851 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -57,14 +57,14 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'info_dict': { 'id': '4878838', 'ext': 'mp3', - 'title': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio - Gypsy Eyes 1', + 'title': 'Carlo Ambrosio, Carlo Ambrosio & Fabio Di Bari - Gypsy Eyes 1', 'filesize': 4628061, 'duration': 193.04, 'track': 'Gypsy Eyes 1', 'album': 'Gypsy Soul', 'album_artist': 'Carlo Ambrosio', - 'artist': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio', - 'release_year': '2009', + 'artist': 'Carlo Ambrosio, Carlo Ambrosio & Fabio Di Bari', + 'release_year': 2009, }, 'skip': 'Travis CI servers blocked by YandexMusic', } @@ -120,7 +120,7 @@ class YandexMusicTrackIE(YandexMusicBaseIE): track_info.update({ 'album': album.get('title'), 'album_artist': extract_artist(album.get('artists')), - 'release_year': compat_str(year) if year else None, + 'release_year': int_or_none(year), }) track_artist = extract_artist(track.get('artists')) From 4a733545867a014eb786348f8fb9e6ae95850742 Mon Sep 17 00:00:00 2001 From: Alex Seiler <seileralex@gmail.com> Date: Sun, 5 Nov 2017 18:07:35 +0100 Subject: [PATCH 106/488] [zattoo] Add extractor (closes #14668) --- youtube_dl/extractor/extractors.py | 6 + youtube_dl/extractor/zattoo.py | 234 +++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 youtube_dl/extractor/zattoo.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6fb65e4fe..9fe3f649d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1418,5 +1418,11 @@ from .youtube import ( ) from .zapiks import ZapiksIE from .zaq1 import Zaq1IE +from .zattoo import ( + QuicklineIE, + QuicklineLiveIE, + ZattooIE, + ZattooLiveIE, +) from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py new file mode 100644 index 000000000..928f22566 --- /dev/null +++ b/youtube_dl/extractor/zattoo.py @@ -0,0 +1,234 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from uuid import uuid4 +import re + +from .common import InfoExtractor +from ..utils import ( + compat_str, + ExtractorError, + sanitized_Request, + urlencode_postdata, +) + + +class ZattooBaseIE(InfoExtractor): + + _NETRC_MACHINE = 'zattoo' + _HOST_URL = 'https://zattoo.com' + + _power_guide_hash = None + + def _login(self, uuid, session_id): + (username, password) = self._get_login_info() + if not username or not password: + raise ExtractorError( + 'A valid %s account is needed to access this media.' % self._NETRC_MACHINE, + expected=True) + login_form = { + 'login': username, + 'password': password, + 'remember': True, + } + request = sanitized_Request( + '%s/zapi/v2/account/login' % self._HOST_URL, + urlencode_postdata(login_form)) + request.add_header( + 'Referer', '%s/login' % self._HOST_URL) + request.add_header( + 'Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8') + request.add_header( + 'Cookie', 'uuid=%s; beaker.session.id=%s' % (uuid, session_id)) + response = self._request_webpage( + request, None, 'Logging in') + data = self._parse_json(response.read(), None) + return data['session']['power_guide_hash'] + + def _get_app_token_and_version(self): + host_webpage = self._download_webpage( + self._HOST_URL, None, 'Downloading %s' % self._HOST_URL) + app_token = self._html_search_regex( + r'<script.+window\.appToken\s*=\s*\'(.+)\'', host_webpage, 'app token') + app_version = self._html_search_regex( + r'<!--\w+-(.+?)-', host_webpage, 'app version', default='2.8.2') + return app_token, app_version + + def _say_hello(self, uuid, app_token, app_version): + postdata = { + 'client_app_token': app_token, + 'uuid': uuid, + 'lang': 'en', + 'app_version': app_version, + 'format': 'json', + } + request = sanitized_Request( + '%s/zapi/v2/session/hello' % self._HOST_URL, + urlencode_postdata(postdata)) + response = self._request_webpage( + request, None, 'Say hello') + + cookie = response.headers.get('Set-Cookie') + session_id = self._search_regex( + r'beaker\.session\.id\s*=\s*(.+?);', cookie, 'session id') + return session_id + + def _extract_cid(self, video_id, channel_name): + channel_groups = self._download_json( + '%s/zapi/v2/cached/channels/%s' % (self._HOST_URL, + self._power_guide_hash), + video_id, + 'Downloading available channel list', + query={'details': False})['channel_groups'] + channel_list = [] + for chgrp in channel_groups: + channel_list.extend(chgrp['channels']) + try: + return next( + chan['cid'] for chan in channel_list + if chan['display_alias'] == channel_name or chan['cid'] == channel_name) + except StopIteration: + raise ExtractorError('Could not extract channel id') + + def _extract_cid_and_video_info(self, video_id): + data = self._download_json( + '%s/zapi/program/details' % self._HOST_URL, + video_id, + 'Downloading video information', + query={ + 'program_id': video_id, + 'complete': True + }) + + info_dict = { + 'id': video_id, + 'title': data['program']['title'], + 'description': data['program'].get('description'), + 'thumbnail': data['program'].get('image_url') + } + cid = data['program']['cid'] + return cid, info_dict + + def _extract_formats(self, cid, video_id, record_id=None, is_live=False): + postdata = { + 'stream_type': 'dash', + 'https_watch_urls': True, + } + if record_id: + url = '%s/zapi/watch/recording/%s' % (self._HOST_URL, record_id) + else: + url = '%s/zapi/watch/recall/%s/%s' % (self._HOST_URL, cid, video_id) + + if is_live: + postdata.update({'timeshift': 10800}) + url = '%s/zapi/watch/live/%s' % (self._HOST_URL, cid) + + data = self._download_json( + sanitized_Request(url, urlencode_postdata(postdata)), + video_id, 'Downloading dash formats') + + formats = [] + for elem in data['stream']['watch_urls']: + audio_channel = elem.get('audio_channel') + maxrate = elem.get('maxrate') + formats.extend( + self._extract_mpd_formats( + elem['url'], video_id, + mpd_id='dash-maxrate-%s-channel-%s' % (maxrate, audio_channel), fatal=False)) + + postdata.update({'stream_type': 'hls'}) + request = sanitized_Request( + url, urlencode_postdata(postdata)) + data = self._download_json( + request, video_id, 'Downloading hls formats') + for elem in data['stream']['watch_urls']: + audio_channel = elem.get('audio_channel') + preference = None + + # Prefer audio channel A: + if audio_channel == 'A': + preference = 1 + + maxrate = elem.get('maxrate') + formats.extend( + self._extract_m3u8_formats( + elem['url'], video_id, 'mp4', entry_protocol='m3u8_native', + preference=preference, + m3u8_id='hls-maxrate-%s-channel-%s' % (maxrate, audio_channel), + fatal=False)) + + self._sort_formats(formats) + return formats + + def _real_initialize(self): + uuid = compat_str(uuid4()) + app_token, app_version = self._get_app_token_and_version() + session_id = self._say_hello(uuid, app_token, app_version) + self._power_guide_hash = self._login(uuid, session_id) + + def _extract_video(self, channel_name, video_id, record_id=None, is_live=False): + if is_live: + cid = self._extract_cid(video_id, channel_name) + info_dict = { + 'id': channel_name, + 'title': self._live_title(channel_name), + 'is_live': True, + } + else: + cid, info_dict = self._extract_cid_and_video_info(video_id) + formats = self._extract_formats( + cid, video_id, record_id=record_id, is_live=is_live) + info_dict['formats'] = formats + return info_dict + + +class QuicklineBaseIE(ZattooBaseIE): + _NETRC_MACHINE = 'quickline' + _HOST_URL = 'https://mobiltv.quickline.com' + + +class QuicklineIE(QuicklineBaseIE): + _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' + + def _real_extract(self, url): + channel_name, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_video(channel_name, video_id) + + +class QuicklineLiveIE(QuicklineBaseIE): + _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<id>[^/]+)$' + + def _real_extract(self, url): + channel_name = video_id = self._match_id(url) + return self._extract_video(channel_name, video_id, is_live=True) + + +class ZattooIE(ZattooBaseIE): + _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' + + # Since regular videos are only available for 7 days and recorded videos + # are only available for a specific user, we cannot have detailed tests. + _TESTS = [{ + 'url': 'https://zattoo.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', + 'only_matching': True, + }, { + 'url': 'https://zattoo.com/watch/srf_zwei/132905652-eishockey-spengler-cup/102791477/1512211800000/1514433500000/92000', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_name, video_id, record_id = re.match(self._VALID_URL, url).groups() + return self._extract_video(channel_name, video_id, record_id) + + +class ZattooLiveIE(ZattooBaseIE): + _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)$' + + _TEST = { + 'url': 'https://zattoo.com/watch/srf1', + 'only_matching': True, + } + + def _real_extract(self, url): + channel_name = video_id = self._match_id(url) + return self._extract_video(channel_name, video_id, is_live=True) From 67ca1a8ef7ea6094e1e34518b93cdb5ba59f31b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 1 May 2018 01:48:21 +0700 Subject: [PATCH 107/488] [zattoo] Improve and simplify (closes #14676) --- youtube_dl/extractor/zattoo.py | 238 +++++++++++++++++++-------------- 1 file changed, 137 insertions(+), 101 deletions(-) diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index 928f22566..773073d85 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -1,84 +1,82 @@ # coding: utf-8 from __future__ import unicode_literals -from uuid import uuid4 import re +from uuid import uuid4 from .common import InfoExtractor -from ..utils import ( +from ..compat import ( + compat_HTTPError, compat_str, +) +from ..utils import ( ExtractorError, - sanitized_Request, + int_or_none, + try_get, urlencode_postdata, ) class ZattooBaseIE(InfoExtractor): - _NETRC_MACHINE = 'zattoo' _HOST_URL = 'https://zattoo.com' _power_guide_hash = None - def _login(self, uuid, session_id): + def _login(self): (username, password) = self._get_login_info() if not username or not password: - raise ExtractorError( - 'A valid %s account is needed to access this media.' % self._NETRC_MACHINE, - expected=True) - login_form = { - 'login': username, - 'password': password, - 'remember': True, - } - request = sanitized_Request( - '%s/zapi/v2/account/login' % self._HOST_URL, - urlencode_postdata(login_form)) - request.add_header( - 'Referer', '%s/login' % self._HOST_URL) - request.add_header( - 'Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8') - request.add_header( - 'Cookie', 'uuid=%s; beaker.session.id=%s' % (uuid, session_id)) - response = self._request_webpage( - request, None, 'Logging in') - data = self._parse_json(response.read(), None) - return data['session']['power_guide_hash'] + self.raise_login_required( + 'A valid %s account is needed to access this media.' + % self._NETRC_MACHINE) - def _get_app_token_and_version(self): - host_webpage = self._download_webpage( - self._HOST_URL, None, 'Downloading %s' % self._HOST_URL) + try: + data = self._download_json( + '%s/zapi/v2/account/login' % self._HOST_URL, None, 'Logging in', + data=urlencode_postdata({ + 'login': username, + 'password': password, + 'remember': 'true', + }), headers={ + 'Referer': '%s/login' % self._HOST_URL, + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + raise ExtractorError( + 'Unable to login: incorrect username and/or password', + expected=True) + raise + + self._power_guide_hash = data['session']['power_guide_hash'] + + def _real_initialize(self): + webpage = self._download_webpage( + self._HOST_URL, None, 'Downloading app token') app_token = self._html_search_regex( - r'<script.+window\.appToken\s*=\s*\'(.+)\'', host_webpage, 'app token') + r'appToken\s*=\s*(["\'])(?P<token>(?:(?!\1).)+?)\1', + webpage, 'app token', group='token') app_version = self._html_search_regex( - r'<!--\w+-(.+?)-', host_webpage, 'app version', default='2.8.2') - return app_token, app_version + r'<!--\w+-(.+?)-', webpage, 'app version', default='2.8.2') - def _say_hello(self, uuid, app_token, app_version): - postdata = { - 'client_app_token': app_token, - 'uuid': uuid, - 'lang': 'en', - 'app_version': app_version, - 'format': 'json', - } - request = sanitized_Request( - '%s/zapi/v2/session/hello' % self._HOST_URL, - urlencode_postdata(postdata)) - response = self._request_webpage( - request, None, 'Say hello') + # Will setup appropriate cookies + self._request_webpage( + '%s/zapi/v2/session/hello' % self._HOST_URL, None, + 'Opening session', data=urlencode_postdata({ + 'client_app_token': app_token, + 'uuid': compat_str(uuid4()), + 'lang': 'en', + 'app_version': app_version, + 'format': 'json', + })) - cookie = response.headers.get('Set-Cookie') - session_id = self._search_regex( - r'beaker\.session\.id\s*=\s*(.+?);', cookie, 'session id') - return session_id + self._login() def _extract_cid(self, video_id, channel_name): channel_groups = self._download_json( '%s/zapi/v2/cached/channels/%s' % (self._HOST_URL, self._power_guide_hash), - video_id, - 'Downloading available channel list', + video_id, 'Downloading channel list', query={'details': False})['channel_groups'] channel_list = [] for chgrp in channel_groups: @@ -86,7 +84,9 @@ class ZattooBaseIE(InfoExtractor): try: return next( chan['cid'] for chan in channel_list - if chan['display_alias'] == channel_name or chan['cid'] == channel_name) + if chan.get('cid') and ( + chan.get('display_alias') == channel_name or + chan.get('cid') == channel_name)) except StopIteration: raise ExtractorError('Could not extract channel id') @@ -100,72 +100,90 @@ class ZattooBaseIE(InfoExtractor): 'complete': True }) + p = data['program'] + cid = p['cid'] + info_dict = { 'id': video_id, - 'title': data['program']['title'], - 'description': data['program'].get('description'), - 'thumbnail': data['program'].get('image_url') + 'title': p.get('title') or p['episode_title'], + 'description': p.get('description'), + 'thumbnail': p.get('image_url'), + 'creator': p.get('channel_name'), + 'episode': p.get('episode_title'), + 'episode_number': int_or_none(p.get('episode_number')), + 'season_number': int_or_none(p.get('season_number')), + 'release_year': int_or_none(p.get('year')), + 'categories': try_get(p, lambda x: x['categories'], list), } - cid = data['program']['cid'] + return cid, info_dict def _extract_formats(self, cid, video_id, record_id=None, is_live=False): - postdata = { - 'stream_type': 'dash', + postdata_common = { 'https_watch_urls': True, } - if record_id: + + if is_live: + postdata_common.update({'timeshift': 10800}) + url = '%s/zapi/watch/live/%s' % (self._HOST_URL, cid) + elif record_id: url = '%s/zapi/watch/recording/%s' % (self._HOST_URL, record_id) else: url = '%s/zapi/watch/recall/%s/%s' % (self._HOST_URL, cid, video_id) - if is_live: - postdata.update({'timeshift': 10800}) - url = '%s/zapi/watch/live/%s' % (self._HOST_URL, cid) - - data = self._download_json( - sanitized_Request(url, urlencode_postdata(postdata)), - video_id, 'Downloading dash formats') - formats = [] - for elem in data['stream']['watch_urls']: - audio_channel = elem.get('audio_channel') - maxrate = elem.get('maxrate') - formats.extend( - self._extract_mpd_formats( - elem['url'], video_id, - mpd_id='dash-maxrate-%s-channel-%s' % (maxrate, audio_channel), fatal=False)) + for stream_type in ('dash', 'hls', 'hls5', 'hds'): + postdata = postdata_common.copy() + postdata['stream_type'] = stream_type - postdata.update({'stream_type': 'hls'}) - request = sanitized_Request( - url, urlencode_postdata(postdata)) - data = self._download_json( - request, video_id, 'Downloading hls formats') - for elem in data['stream']['watch_urls']: - audio_channel = elem.get('audio_channel') - preference = None + data = self._download_json( + url, video_id, 'Downloading %s formats' % stream_type.upper(), + data=urlencode_postdata(postdata), fatal=False) + if not data: + continue - # Prefer audio channel A: - if audio_channel == 'A': - preference = 1 - - maxrate = elem.get('maxrate') - formats.extend( - self._extract_m3u8_formats( - elem['url'], video_id, 'mp4', entry_protocol='m3u8_native', - preference=preference, - m3u8_id='hls-maxrate-%s-channel-%s' % (maxrate, audio_channel), - fatal=False)) + watch_urls = try_get( + data, lambda x: x['stream']['watch_urls'], list) + if not watch_urls: + continue + for watch in watch_urls: + if not isinstance(watch, dict): + continue + watch_url = watch.get('url') + if not watch_url or not isinstance(watch_url, compat_str): + continue + format_id_list = [stream_type] + maxrate = watch.get('maxrate') + if maxrate: + format_id_list.append(compat_str(maxrate)) + audio_channel = watch.get('audio_channel') + if audio_channel: + format_id_list.append(compat_str(audio_channel)) + preference = 1 if audio_channel == 'A' else None + format_id = '-'.join(format_id_list) + if stream_type in ('dash', 'dash_widevine', 'dash_playready'): + this_formats = self._extract_mpd_formats( + watch_url, video_id, mpd_id=format_id, fatal=False) + elif stream_type in ('hls', 'hls5', 'hls5_fairplay'): + this_formats = self._extract_m3u8_formats( + watch_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + fatal=False) + elif stream_type == 'hds': + this_formats = self._extract_f4m_formats( + watch_url, video_id, f4m_id=format_id, fatal=False) + elif stream_type == 'smooth_playready': + this_formats = self._extract_ism_formats( + watch_url, video_id, ism_id=format_id, fatal=False) + else: + assert False + for this_format in this_formats: + this_format['preference'] = preference + formats.extend(this_formats) self._sort_formats(formats) return formats - def _real_initialize(self): - uuid = compat_str(uuid4()) - app_token, app_version = self._get_app_token_and_version() - session_id = self._say_hello(uuid, app_token, app_version) - self._power_guide_hash = self._login(uuid, session_id) - def _extract_video(self, channel_name, video_id, record_id=None, is_live=False): if is_live: cid = self._extract_cid(video_id, channel_name) @@ -190,13 +208,27 @@ class QuicklineBaseIE(ZattooBaseIE): class QuicklineIE(QuicklineBaseIE): _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://mobiltv.quickline.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', + 'only_matching': True, + } + def _real_extract(self, url): channel_name, video_id = re.match(self._VALID_URL, url).groups() return self._extract_video(channel_name, video_id) class QuicklineLiveIE(QuicklineBaseIE): - _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<id>[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<id>[^/]+)' + + _TEST = { + 'url': 'https://mobiltv.quickline.com/watch/srf1', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if QuicklineIE.suitable(url) else super(QuicklineLiveIE, cls).suitable(url) def _real_extract(self, url): channel_name = video_id = self._match_id(url) @@ -222,13 +254,17 @@ class ZattooIE(ZattooBaseIE): class ZattooLiveIE(ZattooBaseIE): - _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)' _TEST = { 'url': 'https://zattoo.com/watch/srf1', 'only_matching': True, } + @classmethod + def suitable(cls, url): + return False if ZattooIE.suitable(url) else super(ZattooLiveIE, cls).suitable(url) + def _real_extract(self, url): channel_name = video_id = self._match_id(url) return self._extract_video(channel_name, video_id, is_live=True) From 851396346803f77ab9573af56cae056aa904cf93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 1 May 2018 02:15:43 +0700 Subject: [PATCH 108/488] [udemy] Extract outputs renditions (closes #16289, closes #16291, closes #16320, closes #16321, closes #16334, closes #16335) --- youtube_dl/extractor/udemy.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 439ed2a89..bf1134e3f 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -58,6 +58,10 @@ class UdemyIE(InfoExtractor): # no url in outputs format entry 'url': 'https://www.udemy.com/learn-web-development-complete-step-by-step-guide-to-success/learn/v4/t/lecture/4125812', 'only_matching': True, + }, { + # only outputs rendition + 'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0', + 'only_matching': True, }] def _extract_course_info(self, webpage, video_id): @@ -357,6 +361,12 @@ class UdemyIE(InfoExtractor): fatal=False) extract_subtitles(text_tracks) + if not formats and outputs: + for format_id, output in outputs.items(): + f = extract_output_format(output, format_id) + if f.get('url'): + formats.append(f) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) return { From c21692fa94df49ef925c06c00e5db1d8bb0f770d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 1 May 2018 03:09:04 +0700 Subject: [PATCH 109/488] [kaltura] Improve iframe embeds detection (closes #16337) --- youtube_dl/extractor/generic.py | 17 +++++++++++++++++ youtube_dl/extractor/kaltura.py | 3 ++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 252f97c26..73980601c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1282,6 +1282,23 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # Kaltura iframe embed, more sophisticated + 'url': 'http://www.cns.nyu.edu/~eero/math-tools/Videos/lecture-05sep2017.html', + 'info_dict': { + 'id': '1_9gzouybz', + 'ext': 'mp4', + 'title': 'lecture-05sep2017', + 'description': 'md5:40f347d91fd4ba047e511c5321064b49', + 'upload_date': '20170913', + 'uploader_id': 'eps2', + 'timestamp': 1505340777, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Kaltura'], + }, { # meta twitter:player 'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/', diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 0ea89e4d6..04f68fce4 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -136,9 +136,10 @@ class KalturaIE(InfoExtractor): re.search( r'''(?xs) <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["']) - (?:https?:)?//(?:(?:www|cdnapi)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) + (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) (?:(?!(?P=q1)).)* [?&;]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+) + (?:(?!(?P=q1)).)* (?P=q1) ''', webpage) ) From cc5772c4f0bcb7dfdfb0575787ff124dd7376de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 1 May 2018 03:30:23 +0700 Subject: [PATCH 110/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4a3df67df..7841ee765 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,29 @@ +version <unreleased> + +Core +* [downloader/fragment] Restart download if .ytdl file is corrupt (#16312) ++ [extractor/common] Extract interaction statistic ++ [utils] Add merge_dicts ++ [extractor/common] Add _download_json_handle + +Extractors +* [kaltura] Improve iframe embeds detection (#16337) ++ [udemy] Extract outputs renditions (#16289, #16291, #16320, #16321, #16334, + #16335) ++ [zattoo] Add support for zattoo.com and mobiltv.quickline.com (#14668, #14676) +* [yandexmusic] Convert release_year to int +* [udemy] Override _download_webpage_handle instead of _download_webpage +* [xiami] Override _download_webpage_handle instead of _download_webpage +* [yandexmusic] Override _download_webpage_handle instead of _download_webpage +* [youtube] Correctly disable polymer on all requests (#16323, #16326) +* [generic] Prefer enclosures over links in RSS feeds (#16189) ++ [redditr] Add support for old.reddit.com URLs (#16274) +* [nrktv] Update API host (#16324) ++ [imdb] Extract all formats (#16249) ++ [vimeo] Extract JSON-LD (#16295) +* [funk:channel] Improve extraction (#16285) + + version 2018.04.25 Core From cc42941390b547ba950b4e76f4950be801f96134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 1 May 2018 03:38:57 +0700 Subject: [PATCH 111/488] release 2018.05.01 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 4 ++++ youtube_dl/version.py | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 252fa0adf..c2bd5d8ae 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.25*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.25** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.04.25 +[debug] youtube-dl version 2018.05.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 7841ee765..916b8edb8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.05.01 Core * [downloader/fragment] Restart download if .ytdl file is corrupt (#16312) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a110f687b..c5a48002b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -667,6 +667,8 @@ - **qqmusic:playlist**: QQ音乐 - 歌单 - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 + - **Quickline** + - **QuicklineLive** - **R7** - **R7Article** - **radio.de** @@ -1092,6 +1094,8 @@ - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) - **Zapiks** - **Zaq1** + - **Zattoo** + - **ZattooLive** - **ZDF** - **ZDFChannel** - **zingmp3**: mp3.zing.vn diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4e3cb39c6..04896efc8 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.04.25' +__version__ = '2018.05.01' From c18142da6e0e99a7b4c9ab488ddb285ad1e8dad9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 1 May 2018 22:46:06 +0700 Subject: [PATCH 112/488] [itv] Improve extraction (closes #16253) --- youtube_dl/extractor/itv.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 18a7d7f8c..457b424a2 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -41,6 +41,14 @@ class ITVIE(InfoExtractor): # unavailable via data-playlist-url 'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033', 'only_matching': True, + }, { + # InvalidVodcrid + 'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034', + 'only_matching': True, + }, { + # ContentUnavailable + 'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024', + 'only_matching': True, }] def _real_extract(self, url): @@ -127,7 +135,8 @@ class ITVIE(InfoExtractor): if fault_code == 'InvalidGeoRegion': self.raise_geo_restricted( msg=fault_string, countries=self._GEO_COUNTRIES) - elif fault_code != 'InvalidEntity': + elif fault_code not in ( + 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): raise ExtractorError( '%s said: %s' % (self.IE_NAME, fault_string), expected=True) info.update({ From a93ce61bd5cbe7779e4eff0f8ab74a8a02211285 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 2 May 2018 01:29:44 +0700 Subject: [PATCH 113/488] [tunein] Use live title for live streams (closes #16347) --- youtube_dl/extractor/tunein.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index 7e51de89e..c7a5f5a63 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -62,7 +62,7 @@ class TuneInBaseIE(InfoExtractor): return { 'id': content_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'formats': formats, 'thumbnail': thumbnail, 'location': location, From 5f95927a62a533b9e616abb5f1481cedeaa16a4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 2 May 2018 07:18:01 +0700 Subject: [PATCH 114/488] Improve geo bypass mechanism * Introduce geo bypass context * Add ability to bypass based on IP blocks in CIDR notation * Introduce --geo-bypass-ip-block --- youtube_dl/YoutubeDL.py | 3 + youtube_dl/__init__.py | 1 + youtube_dl/extractor/anvato.py | 4 +- youtube_dl/extractor/brightcove.py | 5 +- youtube_dl/extractor/common.py | 97 ++++++++++++++++++++++++------ youtube_dl/extractor/dplay.py | 4 +- youtube_dl/extractor/go.py | 2 +- youtube_dl/extractor/limelight.py | 4 +- youtube_dl/extractor/tvplay.py | 6 +- youtube_dl/options.py | 4 ++ youtube_dl/utils.py | 11 ++-- 11 files changed, 113 insertions(+), 28 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ad3598805..f1a359011 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -286,6 +286,9 @@ class YoutubeDL(object): Two-letter ISO 3166-2 country code that will be used for explicit geographic restriction bypassing via faking X-Forwarded-For HTTP header (experimental) + geo_bypass_ip_block: + IP range in CIDR notation that will be used similarly to + geo_bypass_country (experimental) The following options determine which downloader is picked: external_downloader: Executable of the external downloader to call. diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9bb952457..ba435ea42 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -430,6 +430,7 @@ def _real_main(argv=None): 'config_location': opts.config_location, 'geo_bypass': opts.geo_bypass, 'geo_bypass_country': opts.geo_bypass_country, + 'geo_bypass_ip_block': opts.geo_bypass_ip_block, # just for deprecation check 'autonumber': opts.autonumber if opts.autonumber is True else None, 'usetitle': opts.usetitle if opts.usetitle is True else None, diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index 7a29cd2c6..f6a78eb5d 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -277,7 +277,9 @@ class AnvatoIE(InfoExtractor): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) mobj = re.match(self._VALID_URL, url) access_key, video_id = mobj.group('access_key_or_mcp', 'id') diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0e4eaef65..ab62e54d6 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -669,7 +669,10 @@ class BrightcoveNewIE(AdobePassIE): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + 'ip_blocks': smuggled_data.get('geo_ip_blocks'), + }) account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a9939b0fd..3ef5af13c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -346,6 +346,11 @@ class InfoExtractor(object): geo restriction bypass mechanism right away in order to bypass geo restriction, of course, if the mechanism is not disabled. (experimental) + _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted + IP blocks in CIDR notation for this extractor. One of these IP blocks + will be used by geo restriction bypass mechanism similarly + to _GEO_COUNTRIES. (experimental) + NB: both these geo attributes are experimental and may change in future or be completely removed. @@ -358,6 +363,7 @@ class InfoExtractor(object): _x_forwarded_for_ip = None _GEO_BYPASS = True _GEO_COUNTRIES = None + _GEO_IP_BLOCKS = None _WORKING = True def __init__(self, downloader=None): @@ -392,12 +398,15 @@ class InfoExtractor(object): def initialize(self): """Initializes an instance (authentication, etc).""" - self._initialize_geo_bypass(self._GEO_COUNTRIES) + self._initialize_geo_bypass({ + 'countries': self._GEO_COUNTRIES, + 'ip_blocks': self._GEO_IP_BLOCKS, + }) if not self._ready: self._real_initialize() self._ready = True - def _initialize_geo_bypass(self, countries): + def _initialize_geo_bypass(self, geo_bypass_context): """ Initialize geo restriction bypass mechanism. @@ -408,28 +417,82 @@ class InfoExtractor(object): HTTP requests. This method will be used for initial geo bypass mechanism initialization - during the instance initialization with _GEO_COUNTRIES. + during the instance initialization with _GEO_COUNTRIES and + _GEO_IP_BLOCKS. - You may also manually call it from extractor's code if geo countries + You may also manually call it from extractor's code if geo bypass information is not available beforehand (e.g. obtained during - extraction) or due to some another reason. + extraction) or due to some other reason. In this case you should pass + this information in geo bypass context passed as first argument. It may + contain following fields: + + countries: List of geo unrestricted countries (similar + to _GEO_COUNTRIES) + ip_blocks: List of geo unrestricted IP blocks in CIDR notation + (similar to _GEO_IP_BLOCKS) + """ if not self._x_forwarded_for_ip: - country_code = self._downloader.params.get('geo_bypass_country', None) - # If there is no explicit country for geo bypass specified and - # the extractor is known to be geo restricted let's fake IP - # as X-Forwarded-For right away. - if (not country_code and - self._GEO_BYPASS and - self._downloader.params.get('geo_bypass', True) and - countries): - country_code = random.choice(countries) - if country_code: - self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) + + # Geo bypass mechanism is explicitly disabled by user + if not self._downloader.params.get('geo_bypass', True): + return + + if not geo_bypass_context: + geo_bypass_context = {} + + # Backward compatibility: previously _initialize_geo_bypass + # expected a list of countries, some 3rd party code may still use + # it this way + if isinstance(geo_bypass_context, (list, tuple)): + geo_bypass_context = { + 'countries': geo_bypass_context, + } + + # The whole point of geo bypass mechanism is to fake IP + # as X-Forwarded-For HTTP header based on some IP block or + # country code. + + # Path 1: bypassing based on IP block in CIDR notation + + # Explicit IP block specified by user, use it right away + # regardless of whether extractor is geo bypassable or not + ip_block = self._downloader.params.get('geo_bypass_ip_block', None) + + # Otherwise use random IP block from geo bypass context but only + # if extractor is known as geo bypassable + if not ip_block: + ip_blocks = geo_bypass_context.get('ip_blocks') + if self._GEO_BYPASS and ip_blocks: + ip_block = random.choice(ip_blocks) + + if ip_block: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen( + '[debug] Using fake IP %s as X-Forwarded-For.' + % self._x_forwarded_for_ip) + return + + # Path 2: bypassing based on country code + + # Explicit country code specified by user, use it right away + # regardless of whether extractor is geo bypassable or not + country = self._downloader.params.get('geo_bypass_country', None) + + # Otherwise use random country code from geo bypass context but + # only if extractor is known as geo bypassable + if not country: + countries = geo_bypass_context.get('countries') + if self._GEO_BYPASS and countries: + country = random.choice(countries) + + if country: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) if self._downloader.params.get('verbose', False): self._downloader.to_screen( '[debug] Using fake IP %s (%s) as X-Forwarded-For.' - % (self._x_forwarded_for_ip, country_code.upper())) + % (self._x_forwarded_for_ip, country.upper())) def extract(self, url): """Extracts URL information and returns it in list of dicts.""" diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index b73446773..8e0374320 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -102,7 +102,9 @@ class DPlayIE(InfoExtractor): display_id = mobj.group('id') domain = mobj.group('domain') - self._initialize_geo_bypass([mobj.group('country').upper()]) + self._initialize_geo_bypass({ + 'countries': [mobj.group('country').upper()], + }) webpage = self._download_webpage(url, display_id) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 9c7b1bd37..e781405f2 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -123,7 +123,7 @@ class GoIE(AdobePassIE): 'adobe_requestor_id': requestor_id, }) else: - self._initialize_geo_bypass(['US']) + self._initialize_geo_bypass({'countries': ['US']}) entitlement = self._download_json( 'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', video_id, data=urlencode_postdata(data)) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 2803d7e8d..729d8de50 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -282,7 +282,9 @@ class LimelightMediaIE(LimelightBaseIE): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) video_id = self._match_id(url) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) pc, mobile, metadata = self._extract( video_id, 'getPlaylistByMediaId', diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 84597b55e..e09b5f804 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -227,14 +227,16 @@ class TVPlayIE(InfoExtractor): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) video_id = self._match_id(url) geo_country = self._search_regex( r'https?://[^/]+\.([a-z]{2})', url, 'geo country', default=None) if geo_country: - self._initialize_geo_bypass([geo_country.upper()]) + self._initialize_geo_bypass({'countries': [geo_country.upper()]}) video = self._download_json( 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 3e4ac03a2..f3f8f23b6 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -249,6 +249,10 @@ def parseOpts(overrideArguments=None): '--geo-bypass-country', metavar='CODE', dest='geo_bypass_country', default=None, help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)') + geo.add_option( + '--geo-bypass-ip-block', metavar='IP_BLOCK', + dest='geo_bypass_ip_block', default=None, + help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation (experimental)') selection = optparse.OptionGroup(parser, 'Video Selection') selection.add_option( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b460393bf..f9ca63c58 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3534,10 +3534,13 @@ class GeoUtils(object): } @classmethod - def random_ipv4(cls, code): - block = cls._country_ip_map.get(code.upper()) - if not block: - return None + def random_ipv4(cls, code_or_block): + if len(code_or_block) == 2: + block = cls._country_ip_map.get(code_or_block.upper()) + if not block: + return None + else: + block = code_or_block addr, preflen = block.split('/') addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] addr_max = addr_min | (0xffffffff >> int(preflen)) From ea1f5e5dbd6c58d4f0872a65b97611732f4b29bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 2 May 2018 07:21:24 +0700 Subject: [PATCH 115/488] [itv:btcc] Add extractor (closes #16139) --- youtube_dl/extractor/extractors.py | 5 +++- youtube_dl/extractor/itv.py | 37 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9fe3f649d..316c8199d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -477,7 +477,10 @@ from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .iqiyi import IqiyiIE from .ir90tv import Ir90TvIE -from .itv import ITVIE +from .itv import ( + ITVIE, + ITVBTCCIE, +) from .ivi import ( IviIE, IviCompilationIE diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 457b424a2..6a4f8a505 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -7,6 +7,7 @@ import json import re from .common import InfoExtractor +from .brightcove import BrightcoveNewIE from ..compat import ( compat_str, compat_etree_register_namespace, @@ -18,6 +19,7 @@ from ..utils import ( xpath_text, int_or_none, parse_duration, + smuggle_url, ExtractorError, determine_ext, ) @@ -260,3 +262,38 @@ class ITVIE(InfoExtractor): 'subtitles': subtitles, }) return info + + +class ITVBTCCIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch', + 'info_dict': { + 'id': 'btcc-2018-all-the-action-from-brands-hatch', + 'title': 'BTCC 2018: All the action from Brands Hatch', + }, + 'playlist_mincount': 9, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result( + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { + # ITV does not like some GB IP ranges, so here are some + # IP blocks it accepts + 'geo_ip_blocks': [ + '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' + ], + 'referrer': url, + }), + ie=BrightcoveNewIE.ie_key(), video_id=video_id) + for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)] + + title = self._og_search_title(webpage, fatal=False) + + return self.playlist_result(entries, playlist_id, title) From 3cc0d0b8299308958bfe8b4c42c739505df27f50 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 2 May 2018 09:32:53 +0100 Subject: [PATCH 116/488] [discovery] extract Affiliate/Anonymous Auth Token from cookies(closes #14954) --- youtube_dl/extractor/discovery.py | 37 ++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 91449dcd8..3589bd428 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -5,7 +5,10 @@ import re import string from .discoverygo import DiscoveryGoBaseIE -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( ExtractorError, try_get, @@ -55,15 +58,27 @@ class DiscoveryIE(DiscoveryGoBaseIE): video = next(cb for cb in content_blocks if cb.get('type') == 'video')['content']['items'][0] video_id = video['id'] - access_token = self._download_json( - 'https://www.%s.com/anonymous' % site, display_id, query={ - 'authRel': 'authorization', - 'client_id': try_get( - react_data, lambda x: x['application']['apiClientId'], - compat_str) or '3020a40c2356a645b4b4', - 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), - 'redirectUri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html?https://www.%s.com' % site, - })['access_token'] + access_token = None + cookies = self._get_cookies(url) + + # prefer Affiliate Auth Token over Anonymous Auth Token + auth_storage_cookie = cookies.get('eosAf') or cookies.get('eosAn') + if auth_storage_cookie and auth_storage_cookie.value: + auth_storage = self._parse_json(compat_urllib_parse_unquote( + compat_urllib_parse_unquote(auth_storage_cookie.value)), + video_id, fatal=False) or {} + access_token = auth_storage.get('a') or auth_storage.get('access_token') + + if not access_token: + access_token = self._download_json( + 'https://www.%s.com/anonymous' % site, display_id, query={ + 'authRel': 'authorization', + 'client_id': try_get( + react_data, lambda x: x['application']['apiClientId'], + compat_str) or '3020a40c2356a645b4b4', + 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + 'redirectUri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html?https://www.%s.com' % site, + })['access_token'] try: stream = self._download_json( @@ -72,7 +87,7 @@ class DiscoveryIE(DiscoveryGoBaseIE): 'Authorization': 'Bearer ' + access_token, }) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): e_description = self._parse_json( e.cause.read().decode(), display_id)['description'] if 'resource not available for country' in e_description: From a90a6b54ee5ceb6002f4ebd73d62c65cc00484d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 2 May 2018 20:43:34 +0700 Subject: [PATCH 117/488] [watchbox] Fix extraction (closes #16356) --- youtube_dl/extractor/watchbox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py index b382338fa..be0bcba15 100644 --- a/youtube_dl/extractor/watchbox.py +++ b/youtube_dl/extractor/watchbox.py @@ -69,7 +69,7 @@ class WatchBoxIE(InfoExtractor): source = self._parse_json( self._search_regex( - r'(?s)source\s*:\s*({.+?})\s*,\s*\n', webpage, 'source', + r'(?s)source["\']?\s*:\s*({.+?})\s*[,}]', webpage, 'source', default='{}'), video_id, transform_source=js_to_json, fatal=False) or {} From 660a230b2dcc734f018557c7898384ba438e9137 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 5 May 2018 01:21:52 +0700 Subject: [PATCH 118/488] [cloudflarestream] Add support for cloudflare streams (closes #16375) --- youtube_dl/extractor/cloudflarestream.py | 60 ++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 19 ++++++++ 3 files changed, 80 insertions(+) create mode 100644 youtube_dl/extractor/cloudflarestream.py diff --git a/youtube_dl/extractor/cloudflarestream.py b/youtube_dl/extractor/cloudflarestream.py new file mode 100644 index 000000000..e6d92cca2 --- /dev/null +++ b/youtube_dl/extractor/cloudflarestream.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class CloudflareStreamIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:watch\.)?cloudflarestream\.com/| + embed\.cloudflarestream\.com/embed/[^/]+\.js\?.*?\bvideo= + ) + (?P<id>[\da-f]+) + ''' + _TESTS = [{ + 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', + 'info_dict': { + 'id': '31c9291ab41fac05471db4e73aa11717', + 'ext': 'mp4', + 'title': '31c9291ab41fac05471db4e73aa11717', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1', + 'only_matching': True, + }, { + 'url': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/manifest/video.mpd', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.cloudflarestream\.com/embed/[^/]+\.js\?.*?\bvideo=[\da-f]+?.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats = self._extract_m3u8_formats( + 'https://cloudflarestream.com/%s/manifest/video.m3u8' % video_id, + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False) + formats.extend(self._extract_mpd_formats( + 'https://cloudflarestream.com/%s/manifest/video.mpd' % video_id, + video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 316c8199d..a00e003c2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -195,6 +195,7 @@ from .clippit import ClippitIE from .cliprs import ClipRsIE from .clipsyndicate import ClipsyndicateIE from .closertotruth import CloserToTruthIE +from .cloudflarestream import CloudflareStreamIE from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 73980601c..532c995f5 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -107,6 +107,7 @@ from .springboardplatform import SpringboardPlatformIE from .yapfiles import YapFilesIE from .vice import ViceIE from .xfileshare import XFileShareIE +from .cloudflarestream import CloudflareStreamIE class GenericIE(InfoExtractor): @@ -2013,6 +2014,19 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # CloudflareStream embed + 'url': 'https://www.cloudflare.com/products/cloudflare-stream/', + 'info_dict': { + 'id': '31c9291ab41fac05471db4e73aa11717', + 'ext': 'mp4', + 'title': '31c9291ab41fac05471db4e73aa11717', + }, + 'add_ie': [CloudflareStreamIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -3025,6 +3039,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) + cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage) + if cloudflarestream_urls: + return self.playlist_from_matches( + cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] From 789b7774a771335c7d0b42c834195bef2e8617c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 6 May 2018 21:58:55 +0700 Subject: [PATCH 119/488] [businessinsider] Add extractor (closes #16387, closes #16388, closes #16389) --- youtube_dl/extractor/businessinsider.py | 42 +++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 15 --------- 3 files changed, 43 insertions(+), 15 deletions(-) create mode 100644 youtube_dl/extractor/businessinsider.py diff --git a/youtube_dl/extractor/businessinsider.py b/youtube_dl/extractor/businessinsider.py new file mode 100644 index 000000000..dfcf9bc6b --- /dev/null +++ b/youtube_dl/extractor/businessinsider.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .jwplatform import JWPlatformIE + + +class BusinessInsiderIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6', + 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', + 'info_dict': { + 'id': 'hZRllCfw', + 'ext': 'mp4', + 'title': "Here's how much radiation you're exposed to in everyday life", + 'description': 'md5:9a0d6e2c279948aadaa5e84d6d9b99bd', + 'upload_date': '20170709', + 'timestamp': 1499606400, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/', + 'only_matching': True, + }, { + 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + jwplatform_id = self._search_regex( + (r'data-media-id=["\']([a-zA-Z0-9]{8})', + r'id=["\']jwplayer_([a-zA-Z0-9]{8})', + r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})'), + webpage, 'jwplatform id') + return self.url_result( + 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), + video_id=video_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a00e003c2..f03f98a6c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -137,6 +137,7 @@ from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, ) +from .businessinsider import BusinessInsiderIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 532c995f5..76852f9dc 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1472,21 +1472,6 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': ['Failed to parse JSON Expecting value'], }, - # Ooyala embed - { - 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', - 'info_dict': { - 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', - 'ext': 'mp4', - 'description': 'Index/Match versus VLOOKUP.', - 'title': 'This is what separates the Excel masters from the wannabes', - 'duration': 191.933, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - } - }, # Brightcove URL in single quotes { 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', From 0ce76801e8f6e4d69182c20d9cef4de772555ad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 8 May 2018 22:33:35 +0700 Subject: [PATCH 120/488] [udemy] Extract stream URLs (closes #16372) --- youtube_dl/extractor/udemy.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index bf1134e3f..4664e6222 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -105,7 +105,7 @@ class UdemyIE(InfoExtractor): % (course_id, lecture_id), lecture_id, 'Downloading lecture JSON', query={ 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,data', }) def _handle_error(self, response): @@ -303,9 +303,10 @@ class UdemyIE(InfoExtractor): 'url': src, }) - download_urls = asset.get('download_urls') - if isinstance(download_urls, dict): - extract_formats(download_urls.get('Video')) + for url_kind in ('download', 'stream'): + urls = asset.get('%s_urls' % url_kind) + if isinstance(urls, dict): + extract_formats(urls.get('Video')) view_html = lecture.get('view_html') if view_html: From 2fbd86352eaa9df6afeed6698114132aea3cbe81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 8 May 2018 22:57:01 +0700 Subject: [PATCH 121/488] [udemy] Extract asset captions --- youtube_dl/extractor/udemy.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 4664e6222..0a74a9768 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -18,6 +18,7 @@ from ..utils import ( int_or_none, js_to_json, sanitized_Request, + try_get, unescapeHTML, urlencode_postdata, ) @@ -105,7 +106,7 @@ class UdemyIE(InfoExtractor): % (course_id, lecture_id), lecture_id, 'Downloading lecture JSON', query={ 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,data', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data', }) def _handle_error(self, response): @@ -308,6 +309,21 @@ class UdemyIE(InfoExtractor): if isinstance(urls, dict): extract_formats(urls.get('Video')) + captions = asset.get('captions') + if isinstance(captions, list): + for cc in captions: + if not isinstance(cc, dict): + continue + cc_url = cc.get('url') + if not cc_url or not isinstance(cc_url, compat_str): + continue + lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) + sub_dict = (automatic_captions if cc.get('source') == 'auto' + else subtitles) + sub_dict.setdefault(lang or 'en', []).append({ + 'url': cc_url, + }) + view_html = lecture.get('view_html') if view_html: view_html_urls = set() From 05108a496a0eb6ca5d6f93072e2871dec8958b87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 8 May 2018 22:57:52 +0700 Subject: [PATCH 122/488] [YoutubeDL] Ensure ext exists for automatic captions --- youtube_dl/YoutubeDL.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f1a359011..046e03247 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1482,23 +1482,28 @@ class YoutubeDL(object): if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + for cc_kind in ('subtitles', 'automatic_captions'): + cc = info_dict.get(cc_kind) + if cc: + for _, subtitle in cc.items(): + for subtitle_format in subtitle: + if subtitle_format.get('url'): + subtitle_format['url'] = sanitize_url(subtitle_format['url']) + if subtitle_format.get('ext') is None: + subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() + + automatic_captions = info_dict.get('automatic_captions') subtitles = info_dict.get('subtitles') - if subtitles: - for _, subtitle in subtitles.items(): - for subtitle_format in subtitle: - if subtitle_format.get('url'): - subtitle_format['url'] = sanitize_url(subtitle_format['url']) - if subtitle_format.get('ext') is None: - subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: - self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') + self.list_subtitles( + info_dict['id'], automatic_captions, 'automatic captions') self.list_subtitles(info_dict['id'], subtitles, 'subtitles') return + info_dict['requested_subtitles'] = self.process_subtitles( - info_dict['id'], subtitles, - info_dict.get('automatic_captions')) + info_dict['id'], subtitles, automatic_captions) # We now pick which formats have to be downloaded if info_dict.get('formats') is None: From 44277998adae1e17e4d21208e7dd1ad44decc733 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 9 May 2018 00:34:39 +0700 Subject: [PATCH 123/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index 916b8edb8..ab6c5dab6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version <unreleased> + +Core +* [YoutubeDL] Ensure ext exists for automatic captions +* Introduce --geo-bypass-ip-block + +Extractors ++ [udemy] Extract asset captions ++ [udemy] Extract stream URLs (#16372) ++ [businessinsider] Add support for businessinsider.com (#16387, #16388, #16389) ++ [cloudflarestream] Add support for cloudflarestream.com (#16375) +* [watchbox] Fix extraction (#16356) +* [discovery] Extract Affiliate/Anonymous Auth Token from cookies (#14954) ++ [itv:btcc] Add support for itv.com/btcc (#16139) +* [tunein] Use live title for live streams (#16347) +* [itv] Improve extraction (#16253) + + version 2018.05.01 Core From 9e18bb4c67af7b748ee62247d751c0e705aa791a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 9 May 2018 00:36:47 +0700 Subject: [PATCH 124/488] release 2018.05.09 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 3 +++ docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c2bd5d8ae..b2bfa9ec5 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.01** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.09** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.01 +[debug] youtube-dl version 2018.05.09 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index ab6c5dab6..ef6cc3850 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.05.09 Core * [YoutubeDL] Ensure ext exists for automatic captions diff --git a/README.md b/README.md index 5af0f387b..d9fe2350a 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --geo-bypass-country CODE Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental) + --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with + explicitly provided IP block in CIDR + notation (experimental) ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c5a48002b..88fac6e90 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -122,6 +122,7 @@ - **BRMediathek**: Bayerischer Rundfunk Mediathek - **bt:article**: Bergens Tidende Articles - **bt:vestlendingen**: Bergens Tidende - Vestlendingen + - **BusinessInsider** - **BuzzFeed** - **BYUtv** - **Camdemy** @@ -163,6 +164,7 @@ - **ClipRs** - **Clipsyndicate** - **CloserToTruth** + - **CloudflareStream** - **cloudtime**: CloudTime - **Cloudy** - **Clubic** @@ -373,6 +375,7 @@ - **Ir90Tv** - **ITTF** - **ITV** + - **ITVBTCC** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 04896efc8..6f47b1795 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.01' +__version__ = '2018.05.09' From ff8889cd4dfae0ae3758e3d8a496f5724f6dc092 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 10 May 2018 08:19:32 +0100 Subject: [PATCH 125/488] [teamcoco] fix extraction(closes #16374) --- youtube_dl/extractor/teamcoco.py | 175 ++++++++++++++----------------- 1 file changed, 80 insertions(+), 95 deletions(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 9056c8cbc..f06e5b19a 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,35 +1,34 @@ # coding: utf-8 from __future__ import unicode_literals -import binascii -import re import json from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_ord, -) from ..utils import ( - ExtractorError, - qualities, determine_ext, + ExtractorError, + int_or_none, + mimetype2ext, + parse_duration, + parse_iso8601, + qualities, ) class TeamcocoIE(InfoExtractor): - _VALID_URL = r'https?://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)' + _VALID_URL = r'https?://teamcoco\.com/video/(?P<id>[^/?#]+)' _TESTS = [ { - 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', - 'md5': '3f7746aa0dc86de18df7539903d399ea', + 'url': 'http://teamcoco.com/video/mary-kay-remote', + 'md5': '55d532f81992f5c92046ad02fec34d7d', 'info_dict': { 'id': '80187', 'ext': 'mp4', 'title': 'Conan Becomes A Mary Kay Beauty Consultant', 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.', - 'duration': 504, - 'age_limit': 0, + 'duration': 495.0, + 'upload_date': '20140402', + 'timestamp': 1396407600, } }, { 'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush', @@ -40,7 +39,8 @@ class TeamcocoIE(InfoExtractor): 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.', 'title': 'Louis C.K. Interview Pt. 1 11/3/11', 'duration': 288, - 'age_limit': 0, + 'upload_date': '20111104', + 'timestamp': 1320405840, } }, { 'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey', @@ -49,6 +49,8 @@ class TeamcocoIE(InfoExtractor): 'ext': 'mp4', 'title': 'Timothy Olyphant Raises A Toast To “Justified”', 'description': 'md5:15501f23f020e793aeca761205e42c24', + 'upload_date': '20150415', + 'timestamp': 1429088400, }, 'params': { 'skip_download': True, # m3u8 downloads @@ -63,110 +65,93 @@ class TeamcocoIE(InfoExtractor): }, 'params': { 'skip_download': True, # m3u8 downloads - } + }, + 'skip': 'This video is no longer available.', } ] - _VIDEO_ID_REGEXES = ( - r'"eVar42"\s*:\s*(\d+)', - r'Ginger\.TeamCoco\.openInApp\("video",\s*"([^"]+)"', - r'"id_not"\s*:\s*(\d+)' - ) + + def _graphql_call(self, query_template, object_type, object_id): + find_object = 'find' + object_type + return self._download_json( + 'http://teamcoco.com/graphql/', object_id, data=json.dumps({ + 'query': query_template % (find_object, object_id) + }))['data'][find_object] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + display_id = self._match_id(url) - display_id = mobj.group('display_id') - webpage, urlh = self._download_webpage_handle(url, display_id) - if 'src=expired' in urlh.geturl(): - raise ExtractorError('This video is expired.', expected=True) + response = self._graphql_call('''{ + %s(slug: "video/%s") { + ... on RecordSlug { + record { + id + title + teaser + publishOn + thumb { + preview + } + tags { + name + } + duration + } + } + ... on NotFoundSlug { + status + } + } +}''', 'Slug', display_id) + if response.get('status'): + raise ExtractorError('This video is no longer available.', expected=True) - video_id = mobj.group('video_id') - if not video_id: - video_id = self._html_search_regex( - self._VIDEO_ID_REGEXES, webpage, 'video id') + record = response['record'] + video_id = record['id'] - data = None - - preload_codes = self._html_search_regex( - r'(function.+)setTimeout\(function\(\)\{playlist', - webpage, 'preload codes') - base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes) - base64_fragments.remove('init') - - def _check_sequence(cur_fragments): - if not cur_fragments: - return - for i in range(len(cur_fragments)): - cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii') - try: - raw_data = compat_b64decode(cur_sequence) - if compat_ord(raw_data[0]) == compat_ord('{'): - return json.loads(raw_data.decode('utf-8')) - except (TypeError, binascii.Error, UnicodeDecodeError, ValueError): - continue - - def _check_data(): - for i in range(len(base64_fragments) + 1): - for j in range(i, len(base64_fragments) + 1): - data = _check_sequence(base64_fragments[:i] + base64_fragments[j:]) - if data: - return data - - self.to_screen('Try to compute possible data sequence. This may take some time.') - data = _check_data() - - if not data: - raise ExtractorError( - 'Preload information could not be extracted', expected=True) + srcs = self._graphql_call('''{ + %s(id: "%s") { + src + } +}''', 'RecordVideoSource', video_id)['src'] formats = [] - get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) - for filed in data['files']: - if determine_ext(filed['url']) == 'm3u8': - # compat_urllib_parse.urljoin does not work here - if filed['url'].startswith('/'): - m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url'] - else: - m3u8_url = filed['url'] - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4') - for m3u8_format in m3u8_formats: - if m3u8_format not in formats: - formats.append(m3u8_format) - elif determine_ext(filed['url']) == 'f4m': - # TODO Correct f4m extraction + get_quality = qualities(['low', 'sd', 'hd', 'uhd']) + for format_id, src in srcs.items(): + if not isinstance(src, dict): continue + src_url = src.get('src') + if not src_url: + continue + ext = determine_ext(src_url, mimetype2ext(src.get('type'))) + if format_id == 'hls' or ext == 'm3u8': + # compat_urllib_parse.urljoin does not work here + if src_url.startswith('/'): + src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url + formats.extend(self._extract_m3u8_formats( + src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) else: - if filed['url'].startswith('/mp4:protected/'): + if src_url.startswith('/mp4:protected/'): # TODO Correct extraction for these files continue - m_format = re.search(r'(\d+(k|p))\.mp4', filed['url']) - if m_format is not None: - format_id = m_format.group(1) - else: - format_id = filed['bitrate'] - tbr = ( - int(filed['bitrate']) - if filed['bitrate'].isdigit() - else None) + tbr = int_or_none(self._search_regex( + r'(\d+)k\.mp4', src_url, 'tbr', default=None)) formats.append({ - 'url': filed['url'], - 'ext': 'mp4', + 'url': src_url, + 'ext': ext, 'tbr': tbr, 'format_id': format_id, 'quality': get_quality(format_id), }) - self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'formats': formats, - 'title': data['title'], - 'thumbnail': data.get('thumb', {}).get('href'), - 'description': data.get('teaser'), - 'duration': data.get('duration'), - 'age_limit': self._family_friendly_search(webpage), + 'title': record['title'], + 'thumbnail': record.get('thumb', {}).get('preview'), + 'description': record.get('teaser'), + 'duration': parse_duration(record.get('duration')), + 'timestamp': parse_iso8601(record.get('publishOn')), } From 1344d3e169840f6c9d585648c1597da6a2b00ed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 10 May 2018 22:01:13 +0700 Subject: [PATCH 126/488] [nickbr] Relax _VALID_URL (#13230) --- youtube_dl/extractor/nick.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 256a24d86..5e34d776b 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -85,7 +85,7 @@ class NickBrIE(MTVServicesInfoExtractor): https?:// (?: (?P<domain>(?:www\.)?nickjr|mundonick\.uol)\.com\.br| - (?:www\.)?nickjr\.nl + (?:www\.)?nickjr\.[a-z]{2} ) /(?:programas/)?[^/]+/videos/(?:episodios/)?(?P<id>[^/?\#.]+) ''' @@ -98,6 +98,9 @@ class NickBrIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nickjr.nl/paw-patrol/videos/311-ge-wol-dig-om-terug-te-zijn/', 'only_matching': True, + }, { + 'url': 'http://www.nickjr.de/blaze-und-die-monster-maschinen/videos/f6caaf8f-e4e8-4cc1-b489-9380d6dcd059/', + 'only_matching': True, }] def _real_extract(self, url): From bc5e4aa57e92201610f9ab79b10a3ae3b316fc3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 10 May 2018 22:22:26 +0700 Subject: [PATCH 127/488] [mixcloud] Bypass throttling for HTTP formats (#12579, #16424) --- youtube_dl/extractor/mixcloud.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index a56b7690f..b7bccb504 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -179,6 +179,10 @@ class MixcloudIE(InfoExtractor): formats.append({ 'format_id': 'http', 'url': decrypted, + 'downloader_options': { + # Mixcloud starts throttling at >~5M + 'http_chunk_size': 5242880, + }, }) self._sort_formats(formats) From dbd5c502ead468771d45c7893dd5dd14cf99a276 Mon Sep 17 00:00:00 2001 From: llyyr <gopalprasadgd@gmail.com> Date: Thu, 10 May 2018 21:47:23 +0530 Subject: [PATCH 128/488] [redditr] Relax _VALID_URL (closes #16426) --- youtube_dl/extractor/reddit.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py index 8372925be..7b0aa6232 100644 --- a/youtube_dl/extractor/reddit.py +++ b/youtube_dl/extractor/reddit.py @@ -47,7 +47,7 @@ class RedditIE(InfoExtractor): class RedditRIE(InfoExtractor): - _VALID_URL = r'(?P<url>https?://(?:(?:www|old)\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))' + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', 'info_dict': { @@ -86,6 +86,10 @@ class RedditRIE(InfoExtractor): # youtube 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', 'only_matching': True, + }, { + # reddit video @ nm reddit + 'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/', + 'only_matching': True, }] def _real_extract(self, url): From 49fa7de301019e23e66c01e5007561eefd51ca47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 11 May 2018 23:20:12 +0700 Subject: [PATCH 129/488] [twitch:clips] Fix extraction (closes #16429) --- youtube_dl/extractor/twitch.py | 102 ++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 4c11fd3c3..ec96ae506 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -8,6 +8,7 @@ import random from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_kwargs, compat_parse_qs, compat_str, compat_urllib_parse_urlencode, @@ -16,11 +17,14 @@ from ..compat import ( from ..utils import ( clean_html, ExtractorError, + float_or_none, int_or_none, - js_to_json, orderedSet, parse_duration, parse_iso8601, + qualities, + try_get, + unified_timestamp, update_url_query, urlencode_postdata, urljoin, @@ -45,10 +49,11 @@ class TwitchBaseIE(InfoExtractor): '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')), expected=True) - def _call_api(self, path, item_id, note): + def _call_api(self, path, item_id, *args, **kwargs): + kwargs.setdefault('headers', {})['Client-ID'] = self._CLIENT_ID response = self._download_json( - '%s/%s' % (self._API_BASE, path), item_id, note, - headers={'Client-ID': self._CLIENT_ID}) + '%s/%s' % (self._API_BASE, path), item_id, + *args, **compat_kwargs(kwargs)) self._handle_error(response) return response @@ -622,21 +627,23 @@ class TwitchStreamIE(TwitchBaseIE): } -class TwitchClipsIE(InfoExtractor): +class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'https://clips.twitch.tv/ea/AggressiveCobraPoooound', + 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', 'md5': '761769e1eafce0ffebfb4089cb3847cd', 'info_dict': { - 'id': 'AggressiveCobraPoooound', + 'id': '42850523', 'ext': 'mp4', 'title': 'EA Play 2016 Live from the Novo Theatre', 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1465767393, + 'upload_date': '20160612', 'creator': 'EA', 'uploader': 'stereotype_', - 'uploader_id': 'stereotype_', + 'uploader_id': '43566419', }, }, { # multiple formats @@ -647,34 +654,61 @@ class TwitchClipsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + status = self._download_json( + 'https://clips.twitch.tv/api/v2/clips/%s/status' % video_id, + video_id) - clip = self._parse_json( - self._search_regex( - r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'), - video_id, transform_source=js_to_json) + formats = [] - title = clip.get('title') or clip.get('channel_title') or self._og_search_title(webpage) + for option in status['quality_options']: + if not isinstance(option, dict): + continue + source = option.get('source') + if not source or not isinstance(source, compat_str): + continue + formats.append({ + 'url': source, + 'format_id': option.get('quality'), + 'height': int_or_none(option.get('quality')), + 'fps': int_or_none(option.get('frame_rate')), + }) - formats = [{ - 'url': option['source'], - 'format_id': option.get('quality'), - 'height': int_or_none(option.get('quality')), - } for option in clip.get('quality_options', []) if option.get('source')] - - if not formats: - formats = [{ - 'url': clip['clip_video_url'], - }] - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - 'creator': clip.get('broadcaster_display_name') or clip.get('broadcaster_login'), - 'uploader': clip.get('curator_login'), - 'uploader_id': clip.get('curator_display_name'), + info = { 'formats': formats, } + + clip = self._call_api( + 'kraken/clips/%s' % video_id, video_id, fatal=False, headers={ + 'Accept': 'application/vnd.twitchtv.v5+json', + }) + + if clip: + quality_key = qualities(('tiny', 'small', 'medium')) + thumbnails = [] + thumbnails_dict = clip.get('thumbnails') + if isinstance(thumbnails_dict, dict): + for thumbnail_id, thumbnail_url in thumbnails_dict.items(): + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + 'preference': quality_key(thumbnail_id), + }) + + info.update({ + 'id': clip.get('tracking_id') or video_id, + 'title': clip.get('title') or video_id, + 'duration': float_or_none(clip.get('duration')), + 'views': int_or_none(clip.get('views')), + 'timestamp': unified_timestamp(clip.get('created_at')), + 'thumbnails': thumbnails, + 'creator': try_get(clip, lambda x: x['broadcaster']['display_name'], compat_str), + 'uploader': try_get(clip, lambda x: x['curator']['display_name'], compat_str), + 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str), + }) + else: + info.update({ + 'title': video_id, + 'id': video_id, + }) + + return info From 07acdc5afcff3b47b26b26355a75704e3cda670f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 12 May 2018 12:08:54 +0700 Subject: [PATCH 130/488] [twitch:clips] Sort formats --- youtube_dl/extractor/twitch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index ec96ae506..3ee2af52e 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -673,6 +673,8 @@ class TwitchClipsIE(TwitchBaseIE): 'fps': int_or_none(option.get('frame_rate')), }) + self._sort_formats(formats) + info = { 'formats': formats, } From 90b633f86b000f8b6a58ce99d9bbbe0fff6d4f62 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 13 May 2018 11:30:21 +0100 Subject: [PATCH 131/488] [nbc] improve info extraction(fixes #16440) --- youtube_dl/extractor/nbc.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 9dc8f9ebc..1b1722cfa 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,6 +9,7 @@ from .adobepass import AdobePassIE from ..utils import ( find_xpath_attr, smuggle_url, + try_get, unescapeHTML, update_url_query, int_or_none, @@ -78,10 +79,14 @@ class NBCIE(AdobePassIE): def _real_extract(self, url): permalink, video_id = re.match(self._VALID_URL, url).groups() permalink = 'http' + permalink - video_data = self._download_json( + response = self._download_json( 'https://api.nbc.com/v3/videos', video_id, query={ 'filter[permalink]': permalink, - })['data'][0]['attributes'] + 'fields[videos]': 'description,entitlement,episodeNumber,guid,keywords,seasonNumber,title,vChipRating', + 'fields[shows]': 'shortTitle', + 'include': 'show.shortTitle', + }) + video_data = response['data'][0]['attributes'] query = { 'mbr': 'true', 'manifest': 'm3u', @@ -103,10 +108,11 @@ class NBCIE(AdobePassIE): 'title': title, 'url': theplatform_url, 'description': video_data.get('description'), - 'keywords': video_data.get('keywords'), + 'tags': video_data.get('keywords'), 'season_number': int_or_none(video_data.get('seasonNumber')), 'episode_number': int_or_none(video_data.get('episodeNumber')), - 'series': video_data.get('showName'), + 'episode': title, + 'series': try_get(response, lambda x: x['included'][0]['attributes']['shortTitle']), 'ie_key': 'ThePlatform', } From 4c76aa06665621c7689938afd7bbdbc797b5c7ea Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 13 May 2018 13:20:16 +0100 Subject: [PATCH 132/488] [youtube] fix extraction for embed restricted live streams(fixes #16433) --- youtube_dl/extractor/youtube.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 04aeb91af..1f29e8a4e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1537,7 +1537,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map'): + if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) @@ -1969,9 +1969,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' formats.append(a_format) else: - unavailable_message = extract_unavailable_message() - if unavailable_message: - raise ExtractorError(unavailable_message, expected=True) + error_message = clean_html(video_info.get('reason', [None])[0]) + if not error_message: + error_message = extract_unavailable_message() + if error_message: + raise ExtractorError(error_message, expected=True) raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # Look for the DASH manifest From 84a9fef899374d46cfad8d292187ca8d84791c1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 13 May 2018 22:49:01 +0700 Subject: [PATCH 133/488] [youtube] Make uploader extraction non fatal (#16444) --- youtube_dl/extractor/youtube.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1f29e8a4e..897398d20 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1697,9 +1697,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_information_extraction(video_id) # uploader - if 'author' not in video_info: - raise ExtractorError('Unable to extract uploader name') - video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0]) + video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) + if video_uploader: + video_uploader = compat_urllib_parse_unquote_plus(video_uploader) + else: + self._downloader.report_warning('unable to extract uploader name') # uploader_id video_uploader_id = None From c63ca0eef8ac147b3f2a39ba7265ad1b3c11d516 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 14 May 2018 23:27:56 +0700 Subject: [PATCH 134/488] [youtube] Improve format filesize extraction (#16453) --- youtube_dl/extractor/youtube.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 897398d20..7f4298c08 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1815,6 +1815,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): chapters = self._extract_chapters(description_original, video_duration) + def _extract_filesize(media_url): + return int_or_none(self._search_regex( + r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() formats = [{ @@ -1919,8 +1923,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) + filesize = int_or_none(url_data.get( + 'clen', [None])[0]) or _extract_filesize(url) + more_fields = { - 'filesize': int_or_none(url_data.get('clen', [None])[0]), + 'filesize': filesize, 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), 'width': width, 'height': height, @@ -1994,6 +2001,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for df in self._extract_mpd_formats( mpd_url, video_id, fatal=dash_mpd_fatal, formats_dict=self._formats): + if not df.get('filesize'): + df['filesize'] = _extract_filesize(df['url']) # Do not overwrite DASH format found in some previous DASH manifest if df['format_id'] not in dash_formats: dash_formats[df['format_id']] = df From 1e4fe5a7cc80f73b92e068515352d7c7124a49c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 14 May 2018 23:42:33 +0700 Subject: [PATCH 135/488] [options] Fix typo (closes #16450) --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index f3f8f23b6..b692c6b3b 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -232,7 +232,7 @@ def parseOpts(overrideArguments=None): '--geo-verification-proxy', dest='geo_verification_proxy', default=None, metavar='URL', help='Use this proxy to verify the IP address for some geo-restricted sites. ' - 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading.') + 'The default proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading.') geo.add_option( '--cn-verification-proxy', dest='cn_verification_proxy', default=None, metavar='URL', From 7f34984e811897b65e1b7e3a25cfdb45bf863dcf Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 May 2018 08:08:44 +0100 Subject: [PATCH 136/488] [dtube] Add new extractor(closes #15201) --- youtube_dl/extractor/dtube.py | 83 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 84 insertions(+) create mode 100644 youtube_dl/extractor/dtube.py diff --git a/youtube_dl/extractor/dtube.py b/youtube_dl/extractor/dtube.py new file mode 100644 index 000000000..4ca97f860 --- /dev/null +++ b/youtube_dl/extractor/dtube.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re +from socket import timeout + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class DTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?d\.tube/(?:#!/)?v/(?P<uploader_id>[0-9a-z.-]+)/(?P<id>[0-9a-z]{8})' + _TEST = { + 'url': 'https://d.tube/#!/v/benswann/zqd630em', + 'md5': 'a03eaa186618ffa7a3145945543a251e', + 'info_dict': { + 'id': 'zqd630em', + 'ext': 'mp4', + 'title': 'Reality Check: FDA\'s Disinformation Campaign on Kratom', + 'description': 'md5:700d164e066b87f9eac057949e4227c2', + 'uploader_id': 'benswann', + 'upload_date': '20180222', + 'timestamp': 1519328958, + }, + 'params': { + 'format': '480p', + }, + } + + def _real_extract(self, url): + uploader_id, video_id = re.match(self._VALID_URL, url).groups() + result = self._download_json('https://api.steemit.com/', video_id, data=json.dumps({ + 'jsonrpc': '2.0', + 'method': 'get_content', + 'params': [uploader_id, video_id], + }).encode())['result'] + + metadata = json.loads(result['json_metadata']) + video = metadata['video'] + content = video['content'] + info = video.get('info', {}) + title = info.get('title') or result['title'] + + def canonical_url(h): + if not h: + return None + return 'https://ipfs.io/ipfs/' + h + + formats = [] + for q in ('240', '480', '720', '1080', ''): + video_url = canonical_url(content.get('video%shash' % q)) + if not video_url: + continue + format_id = (q + 'p') if q else 'Source' + try: + self.to_screen('%s: Checking %s video format URL' % (video_id, format_id)) + self._downloader._opener.open(video_url, timeout=5).close() + except timeout as e: + self.to_screen( + '%s: %s URL is invalid, skipping' % (video_id, format_id)) + continue + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'height': int_or_none(q), + 'ext': 'mp4', + }) + + return { + 'id': video_id, + 'title': title, + 'description': content.get('description'), + 'thumbnail': canonical_url(info.get('snaphash')), + 'tags': content.get('tags') or metadata.get('tags'), + 'duration': info.get('duration'), + 'formats': formats, + 'timestamp': parse_iso8601(result.get('created')), + 'uploader_id': uploader_id, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f03f98a6c..4da477647 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -283,6 +283,7 @@ from .drtv import ( DRTVIE, DRTVLiveIE, ) +from .dtube import DTubeIE from .dvtv import DVTVIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE From fe3a60f040f614d36e99f80ea1e3a8387d995fff Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 May 2018 11:24:44 +0100 Subject: [PATCH 137/488] [dreisat] improve extraction(closes #15350) - extract all formats - extract more format metadata extraction - improve format sorting - use hls native downloader - detect geo-restriction - bypass geo-restriction --- youtube_dl/extractor/dreisat.py | 141 +++++++++++++++----------------- 1 file changed, 65 insertions(+), 76 deletions(-) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index f138025d5..8d31258c1 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -8,7 +8,6 @@ from ..utils import ( unified_strdate, xpath_text, determine_ext, - qualities, float_or_none, ExtractorError, ) @@ -16,7 +15,8 @@ from ..utils import ( class DreiSatIE(InfoExtractor): IE_NAME = '3sat' - _VALID_URL = r'(?:https?://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' + _GEO_COUNTRIES = ['DE'] + _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)' _TESTS = [ { 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', @@ -43,7 +43,8 @@ class DreiSatIE(InfoExtractor): def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): param_groups = {} for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.attrib.get(self._xpath_ns('id', 'http://www.w3.org/XML/1998/namespace')) + group_id = param_group.get(self._xpath_ns( + 'id', 'http://www.w3.org/XML/1998/namespace')) params = {} for param in param_group: params[param.get('name')] = param.get('value') @@ -54,7 +55,7 @@ class DreiSatIE(InfoExtractor): src = video.get('src') if not src: continue - bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) group_id = video.get('paramGroup') param_group = param_groups[group_id] for proto in param_group['protocols'].split(','): @@ -75,66 +76,36 @@ class DreiSatIE(InfoExtractor): note='Downloading video info', errnote='Failed to download video info') - status_code = doc.find('./status/statuscode') - if status_code is not None and status_code.text != 'ok': - code = status_code.text - if code == 'notVisibleAnymore': + status_code = xpath_text(doc, './status/statuscode') + if status_code and status_code != 'ok': + if status_code == 'notVisibleAnymore': message = 'Video %s is not available' % video_id else: - message = '%s returned error: %s' % (self.IE_NAME, code) + message = '%s returned error: %s' % (self.IE_NAME, status_code) raise ExtractorError(message, expected=True) - title = doc.find('.//information/title').text - description = xpath_text(doc, './/information/detail', 'description') - duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) - uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') - uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') - upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) + title = xpath_text(doc, './/information/title', 'title', True) - def xml_to_thumbnails(fnode): - thumbnails = [] - for node in fnode: - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - if 'key' in node.attrib: - m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - return thumbnails - - thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) - - format_nodes = doc.findall('.//formitaeten/formitaet') - quality = qualities(['veryhigh', 'high', 'med', 'low']) - - def get_quality(elem): - return quality(xpath_text(elem, 'quality')) - format_nodes.sort(key=get_quality) - format_ids = [] + urls = [] formats = [] - for fnode in format_nodes: - video_url = fnode.find('url').text - is_available = 'http://www.metafilegenerator' not in video_url - if not is_available: + for fnode in doc.findall('.//formitaeten/formitaet'): + video_url = xpath_text(fnode, 'url') + if not video_url or video_url in urls: continue + urls.append(video_url) + + is_available = 'http://www.metafilegenerator' not in video_url + geoloced = 'static_geoloced_online' in video_url + if not is_available or geoloced: + continue + format_id = fnode.attrib['basetype'] - quality = xpath_text(fnode, './quality', 'quality') format_m = re.match(r'''(?x) (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) ''', format_id) ext = determine_ext(video_url, None) or format_m.group('container') - if ext not in ('smil', 'f4m', 'm3u8'): - format_id = format_id + '-' + quality - if format_id in format_ids: - continue if ext == 'meta': continue @@ -147,24 +118,23 @@ class DreiSatIE(InfoExtractor): if video_url.startswith('https://'): continue formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id=format_id, fatal=False)) else: - proto = format_m.group('proto').lower() + quality = xpath_text(fnode, './quality') + if quality: + format_id += '-' + quality - abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) + abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - width = int_or_none(xpath_text(fnode, './width', 'width')) - height = int_or_none(xpath_text(fnode, './height', 'height')) - - filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) - - format_note = '' - if not format_note: - format_note = None + tbr = int_or_none(self._search_regex( + r'_(\d+)k', video_url, 'bitrate', None)) + if tbr and vbr and not abr: + abr = tbr - vbr formats.append({ 'format_id': format_id, @@ -174,31 +144,50 @@ class DreiSatIE(InfoExtractor): 'vcodec': format_m.group('vcodec'), 'abr': abr, 'vbr': vbr, - 'width': width, - 'height': height, - 'filesize': filesize, - 'format_note': format_note, - 'protocol': proto, - '_available': is_available, + 'tbr': tbr, + 'width': int_or_none(xpath_text(fnode, './width')), + 'height': int_or_none(xpath_text(fnode, './height')), + 'filesize': int_or_none(xpath_text(fnode, './filesize')), + 'protocol': format_m.group('proto').lower(), }) - format_ids.append(format_id) + + geolocation = xpath_text(doc, './/details/geolocation') + if not formats and geolocation and geolocation != 'none': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) self._sort_formats(formats) + thumbnails = [] + for node in doc.findall('.//teaserimages/teaserimage'): + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + thumbnail_key = node.get('key') + if thumbnail_key: + m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + + upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) + return { 'id': video_id, 'title': title, - 'description': description, - 'duration': duration, + 'description': xpath_text(doc, './/information/detail'), + 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'uploader': xpath_text(doc, './/details/originChannelTitle'), + 'uploader_id': xpath_text(doc, './/details/originChannelId'), 'upload_date': upload_date, 'formats': formats, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + video_id = self._match_id(url) + details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id return self.extract_from_xml_url(video_id, details_url) From 997530d9d472285126bdfb642915062f286d38a8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 May 2018 12:04:24 +0100 Subject: [PATCH 138/488] [dailymotion] remove fragment part from m3u8 urls(closes #8915) --- youtube_dl/extractor/dailymotion.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 0e7d587dd..de27fffd4 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -180,9 +180,12 @@ class DailymotionIE(DailymotionBaseInfoExtractor): continue ext = mimetype2ext(type_) or determine_ext(media_url) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( media_url, video_id, 'mp4', preference=-1, - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + for f in m3u8_formats: + f['url'] = f['url'].split('#')[0] + formats.append(f) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( media_url, video_id, preference=-1, f4m_id='hds', fatal=False)) From 54fc90aabfb71968f28af68dfe3f7a3544cc2f0b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 May 2018 16:24:44 +0100 Subject: [PATCH 139/488] [youtube] fix hd720 format position --- youtube_dl/extractor/youtube.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7f4298c08..e4eec7c30 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -37,6 +37,7 @@ from ..utils import ( orderedSet, parse_codecs, parse_duration, + qualities, remove_quotes, remove_start, smuggle_url, @@ -1844,6 +1845,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'width': int_or_none(width_height[0]), 'height': int_or_none(width_height[1]), } + q = qualities(['small', 'medium', 'hd720']) formats = [] for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) @@ -1926,13 +1928,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): filesize = int_or_none(url_data.get( 'clen', [None])[0]) or _extract_filesize(url) + quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0] + more_fields = { 'filesize': filesize, 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), 'width': width, 'height': height, 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0], + 'format_note': quality, + 'quality': q(quality), } for key, value in more_fields.items(): if value: From 6843ac5b1395157608324be71dc84803b3495857 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 May 2018 17:49:35 +0100 Subject: [PATCH 140/488] add support for paramountnetwork.com and bellator.com(fixes #15418) --- youtube_dl/extractor/extractors.py | 5 ++- youtube_dl/extractor/spike.py | 63 +++++++++++++----------------- 2 files changed, 31 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4da477647..48e3da9c4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1016,7 +1016,10 @@ from .spankbang import SpankBangIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE -from .spike import SpikeIE +from .spike import ( + BellatorIE, + ParamountNetworkIE, +) from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import SportBoxEmbedIE diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index a7b1b3b5f..e76522b45 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -1,55 +1,46 @@ from __future__ import unicode_literals -import re - from .mtv import MTVServicesInfoExtractor -class SpikeIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spike\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' +class BellatorIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bellator\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' _TESTS = [{ - 'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle', - 'md5': '1a9265f32b0c375793d6c4ce45255256', + 'url': 'http://www.bellator.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', 'info_dict': { - 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', + 'id': 'b55e434e-fde1-4a98-b7cc-92003a034de4', 'ext': 'mp4', - 'title': 'Auction Hunters|December 27, 2013|4|414|Can Allen Ride A Hundred Year-Old Motorcycle?', - 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', - 'timestamp': 1388120400, - 'upload_date': '20131227', + 'title': 'Douglas Lima vs. Paul Daley - Round 1', + 'description': 'md5:805a8dd29310fd611d32baba2f767885', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, }, { - 'url': 'http://www.spike.com/full-episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-209', - 'md5': 'b25c6f16418aefb9ad5a6cae2559321f', + 'url': 'http://www.bellator.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', + 'only_matching': True, + }] + + _FEED_URL = 'http://www.spike.com/feeds/mrss/' + _GEO_COUNTRIES = ['US'] + + +class ParamountNetworkIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' + _TESTS = [{ + 'url': 'http://www.paramountnetwork.com/episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-13', 'info_dict': { 'id': '37ace3a8-1df6-48be-85b8-38df8229e241', 'ext': 'mp4', 'title': 'Lip Sync Battle|April 28, 2016|2|209|Joel McHale Vs. Jim Rash|Act 1', 'description': 'md5:a739ca8f978a7802f67f8016d27ce114', }, - }, { - 'url': 'http://www.spike.com/video-clips/lhtu8m/', - 'only_matching': True, - }, { - 'url': 'http://www.spike.com/video-clips/lhtu8m', - 'only_matching': True, - }, { - 'url': 'http://bellator.spike.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', - 'only_matching': True, - }, { - 'url': 'http://bellator.spike.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', - 'only_matching': True, + 'params': { + # m3u8 download + 'skip_download': True, + }, }] - _FEED_URL = 'http://www.spike.com/feeds/mrss/' - _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s' - _CUSTOM_URL_REGEX = re.compile(r'spikenetworkapp://([^/]+/[-a-fA-F0-9]+)') + _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - - def _extract_mgid(self, webpage): - mgid = super(SpikeIE, self)._extract_mgid(webpage) - if mgid is None: - url_parts = self._search_regex(self._CUSTOM_URL_REGEX, webpage, 'episode_id') - video_type, episode_id = url_parts.split('/', 1) - mgid = 'mgid:arc:{0}:spike.com:{1}'.format(video_type, episode_id) - return mgid From eea2fafcf506336e37ca514f72757acf8ee004af Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 May 2018 18:34:25 +0100 Subject: [PATCH 141/488] [pbs] fix embed data extraction(fixes #16474) --- youtube_dl/extractor/pbs.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f11d5da52..a28ee17ca 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -505,7 +505,7 @@ class PBSIE(InfoExtractor): if player: video_info = self._parse_json( self._search_regex( - r'(?s)PBS\.videoData\s*=\s*({.+?});\n', + [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', r'window\.videoBridge\s*=\s*({.+?});'], player, '%s video data' % page, default='{}'), display_id, transform_source=js_to_json, fatal=False) if video_info: @@ -513,10 +513,14 @@ class PBSIE(InfoExtractor): if not info: info = video_info if not chapters: - for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): - chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) - if not chapter: - continue + raw_chapters = video_info.get('chapters') or [] + if not raw_chapters: + for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): + chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) + if not chapter: + continue + raw_chapters.append(chapter) + for chapter in raw_chapters: start_time = float_or_none(chapter.get('start_time'), 1000) duration = float_or_none(chapter.get('duration'), 1000) if start_time is None or duration is None: From 58a68d8fdae5358273ee52d05d77fe42094e128e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 May 2018 18:44:33 +0100 Subject: [PATCH 142/488] [moniker] Remove extractor(closes #15336) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/moniker.py | 116 ----------------------------- 2 files changed, 117 deletions(-) delete mode 100644 youtube_dl/extractor/moniker.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 48e3da9c4..24c23646c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -625,7 +625,6 @@ from .mnet import MnetIE from .moevideo import MoeVideoIE from .mofosex import MofosexIE from .mojvideo import MojvideoIE -from .moniker import MonikerIE from .morningstar import MorningstarIE from .motherless import ( MotherlessIE, diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py deleted file mode 100644 index b208820fe..000000000 --- a/youtube_dl/extractor/moniker.py +++ /dev/null @@ -1,116 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import os.path -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - remove_start, - sanitized_Request, - urlencode_postdata, -) - - -class MonikerIE(InfoExtractor): - IE_DESC = 'allmyvideos.net and vidspot.net' - _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?:(?:2|v)/v-)?(?P<id>[a-zA-Z0-9_-]+)' - - _TESTS = [{ - 'url': 'http://allmyvideos.net/jih3nce3x6wn', - 'md5': '710883dee1bfc370ecf9fa6a89307c88', - 'info_dict': { - 'id': 'jih3nce3x6wn', - 'ext': 'mp4', - 'title': 'youtube-dl test video', - }, - }, { - 'url': 'http://allmyvideos.net/embed-jih3nce3x6wn', - 'md5': '710883dee1bfc370ecf9fa6a89307c88', - 'info_dict': { - 'id': 'jih3nce3x6wn', - 'ext': 'mp4', - 'title': 'youtube-dl test video', - }, - }, { - 'url': 'http://vidspot.net/l2ngsmhs8ci5', - 'md5': '710883dee1bfc370ecf9fa6a89307c88', - 'info_dict': { - 'id': 'l2ngsmhs8ci5', - 'ext': 'mp4', - 'title': 'youtube-dl test video', - }, - }, { - 'url': 'https://www.vidspot.net/l2ngsmhs8ci5', - 'only_matching': True, - }, { - 'url': 'http://vidspot.net/2/v-ywDf99', - 'md5': '5f8254ce12df30479428b0152fb8e7ba', - 'info_dict': { - 'id': 'ywDf99', - 'ext': 'mp4', - 'title': 'IL FAIT LE MALIN EN PORSHE CAYENNE ( mais pas pour longtemps)', - 'description': 'IL FAIT LE MALIN EN PORSHE CAYENNE.', - }, - }, { - 'url': 'http://allmyvideos.net/v/v-HXZm5t', - 'only_matching': True, - }] - - def _real_extract(self, url): - orig_video_id = self._match_id(url) - video_id = remove_start(orig_video_id, 'embed-') - url = url.replace(orig_video_id, video_id) - assert re.match(self._VALID_URL, url) is not None - orig_webpage = self._download_webpage(url, video_id) - - if '>File Not Found<' in orig_webpage: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - error = self._search_regex( - r'class="err">([^<]+)<', orig_webpage, 'error', default=None) - if error: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error), expected=True) - - builtin_url = self._search_regex( - r'<iframe[^>]+src=(["\'])(?P<url>.+?/builtin-.+?)\1', - orig_webpage, 'builtin URL', default=None, group='url') - - if builtin_url: - req = sanitized_Request(builtin_url) - req.add_header('Referer', url) - webpage = self._download_webpage(req, video_id, 'Downloading builtin page') - title = self._og_search_title(orig_webpage).strip() - description = self._og_search_description(orig_webpage).strip() - else: - fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) - data = dict(fields) - - post = urlencode_postdata(data) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - } - req = sanitized_Request(url, post, headers) - webpage = self._download_webpage( - req, video_id, note='Downloading video page ...') - - title = os.path.splitext(data['fname'])[0] - description = None - - # Could be several links with different quality - links = re.findall(r'"file" : "?(.+?)",', webpage) - # Assume the links are ordered in quality - formats = [{ - 'url': l, - 'quality': i, - } for i, l in enumerate(links)] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - } From 1306f5ed726b9f8778a5cc0586436b555f64c2ff Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 May 2018 19:11:48 +0100 Subject: [PATCH 143/488] [mychannels] add support for mychannels.com(closes #15334) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/minoto.py | 19 +++++++----------- .../{makerschannel.py => mychannels.py} | 20 +++++++++---------- 3 files changed, 18 insertions(+), 23 deletions(-) rename youtube_dl/extractor/{makerschannel.py => mychannels.py} (59%) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 24c23646c..7d5927131 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -582,7 +582,6 @@ from .mailru import ( MailRuMusicIE, MailRuMusicSearchIE, ) -from .makerschannel import MakersChannelIE from .makertv import MakerTVIE from .mangomolo import ( MangomoloVideoIE, @@ -645,6 +644,7 @@ from .mtv import ( from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE from .mwave import MwaveIE, MwaveMeetGreetIE +from .mychannels import MyChannelsIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE from .myvi import ( diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py index 959a10589..636731195 100644 --- a/youtube_dl/extractor/minoto.py +++ b/youtube_dl/extractor/minoto.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + parse_codecs, +) class MinotoIE(InfoExtractor): @@ -26,7 +29,7 @@ class MinotoIE(InfoExtractor): formats.extend(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False) else: fmt_profile = fmt.get('profile') or {} - f = { + formats.append({ 'format_id': fmt_profile.get('name-short'), 'format_note': fmt_profile.get('name'), 'url': fmt_url, @@ -35,16 +38,8 @@ class MinotoIE(InfoExtractor): 'filesize': int_or_none(fmt.get('filesize')), 'width': int_or_none(fmt.get('width')), 'height': int_or_none(fmt.get('height')), - } - codecs = fmt.get('codecs') - if codecs: - codecs = codecs.split(',') - if len(codecs) == 2: - f.update({ - 'vcodec': codecs[0], - 'acodec': codecs[1], - }) - formats.append(f) + 'codecs': parse_codecs(fmt.get('codecs')), + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/makerschannel.py b/youtube_dl/extractor/mychannels.py similarity index 59% rename from youtube_dl/extractor/makerschannel.py rename to youtube_dl/extractor/mychannels.py index f5d00e61d..b1ffe7848 100644 --- a/youtube_dl/extractor/makerschannel.py +++ b/youtube_dl/extractor/mychannels.py @@ -6,17 +6,17 @@ import re from .common import InfoExtractor -class MakersChannelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?makerschannel\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)' +class MyChannelsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mychannels\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)' _TEST = { - 'url': 'http://makerschannel.com/en/zoomin/community-highlights?video_id=849', - 'md5': '624a512c6969236b5967bf9286345ad1', + 'url': 'https://mychannels.com/missholland/miss-holland?production_id=3416', + 'md5': 'b8993daad4262dd68d89d651c0c52c45', 'info_dict': { - 'id': '849', + 'id': 'wUUDZZep6vQD', 'ext': 'mp4', - 'title': 'Landing a bus on a plane is an epic win', - 'uploader': 'ZoomIn', - 'description': 'md5:cd9cca2ea7b69b78be81d07020c97139', + 'title': 'Miss Holland joins VOTE LEAVE', + 'description': 'Miss Holland | #13 Not a potato', + 'uploader': 'Miss Holland', } } @@ -27,12 +27,12 @@ class MakersChannelIE(InfoExtractor): def extract_data_val(attr, fatal=False): return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal) - minoto_id = self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') + minoto_id = extract_data_val('minoto-id') or self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') return { '_type': 'url_transparent', 'url': 'minoto:%s' % minoto_id, - 'id': extract_data_val('video-id', True), + 'id': url_id, 'title': extract_data_val('title', True), 'description': extract_data_val('description'), 'thumbnail': extract_data_val('image'), From a3f86160fa15f9e65789a73208cb50b0d82d715f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 17 May 2018 13:46:05 +0100 Subject: [PATCH 144/488] [pluralsight] fix clip id extraction(fixes #16460) --- youtube_dl/extractor/pluralsight.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index aacc5d4bb..3c508c9ca 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -140,10 +140,10 @@ class PluralsightIE(PluralsightBaseIE): raise ExtractorError('Unable to log in') - def _get_subtitles(self, author, clip_id, lang, name, duration, video_id): + def _get_subtitles(self, author, clip_idx, lang, name, duration, video_id): captions_post = { 'a': author, - 'cn': clip_id, + 'cn': clip_idx, 'lc': lang, 'm': name, } @@ -195,13 +195,13 @@ class PluralsightIE(PluralsightBaseIE): author = qs.get('author', [None])[0] name = qs.get('name', [None])[0] - clip_id = qs.get('clip', [None])[0] + clip_idx = qs.get('clip', [None])[0] course_name = qs.get('course', [None])[0] - if any(not f for f in (author, name, clip_id, course_name,)): + if any(not f for f in (author, name, clip_idx, course_name,)): raise ExtractorError('Invalid URL', expected=True) - display_id = '%s-%s' % (name, clip_id) + display_id = '%s-%s' % (name, clip_idx) course = self._download_course(course_name, url, display_id) @@ -217,7 +217,7 @@ class PluralsightIE(PluralsightBaseIE): clip_index = clip_.get('index') if clip_index is None: continue - if compat_str(clip_index) == clip_id: + if compat_str(clip_index) == clip_idx: clip = clip_ break @@ -225,6 +225,7 @@ class PluralsightIE(PluralsightBaseIE): raise ExtractorError('Unable to resolve clip') title = clip['title'] + clip_id = clip.get('clipName') or clip.get('name') or clip['clipId'] QUALITIES = { 'low': {'width': 640, 'height': 480}, @@ -277,7 +278,7 @@ class PluralsightIE(PluralsightBaseIE): clip_post = { 'author': author, 'includeCaptions': False, - 'clipIndex': int(clip_id), + 'clipIndex': int(clip_idx), 'courseName': course_name, 'locale': 'en', 'moduleName': name, @@ -330,10 +331,10 @@ class PluralsightIE(PluralsightBaseIE): # TODO: other languages? subtitles = self.extract_subtitles( - author, clip_id, 'en', name, duration, display_id) + author, clip_idx, 'en', name, duration, display_id) return { - 'id': clip.get('clipName') or clip['name'], + 'id': clip_id, 'title': title, 'duration': duration, 'creator': author, From 361a965b5cd83b725560f740570d208c2a6886ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 May 2018 23:21:40 +0700 Subject: [PATCH 145/488] [vimeo:likes] Relax _VALID_URL and fix single page likes extraction (closes #16475) --- youtube_dl/extractor/vimeo.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index a026526b2..8dfd8891c 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -989,10 +989,10 @@ class VimeoWatchLaterIE(VimeoChannelIE): class VimeoLikesIE(InfoExtractor): - _VALID_URL = r'https://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)' + _VALID_URL = r'https://(?:www\.)?vimeo\.com/(?P<id>[^/]+)/likes/?(?:$|[?#]|sort:)' IE_NAME = 'vimeo:likes' IE_DESC = 'Vimeo user likes' - _TEST = { + _TESTS = [{ 'url': 'https://vimeo.com/user755559/likes/', 'playlist_mincount': 293, 'info_dict': { @@ -1000,7 +1000,10 @@ class VimeoLikesIE(InfoExtractor): 'description': 'See all the videos urza likes', 'title': 'Videos urza likes', }, - } + }, { + 'url': 'https://vimeo.com/stormlapse/likes', + 'only_matching': True, + }] def _real_extract(self, url): user_id = self._match_id(url) @@ -1009,7 +1012,7 @@ class VimeoLikesIE(InfoExtractor): self._search_regex( r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)"> .*?</a></li>\s*<li\s+class="pagination_next"> - ''', webpage, 'page count'), + ''', webpage, 'page count', default=1), 'page count', fatal=True) PAGE_SIZE = 12 title = self._html_search_regex( @@ -1017,7 +1020,7 @@ class VimeoLikesIE(InfoExtractor): description = self._html_search_meta('description', webpage) def _get_page(idx): - page_url = 'https://vimeo.com/user%s/likes/page:%d/sort:date' % ( + page_url = 'https://vimeo.com/%s/likes/page:%d/sort:date' % ( user_id, idx + 1) webpage = self._download_webpage( page_url, user_id, @@ -1037,7 +1040,7 @@ class VimeoLikesIE(InfoExtractor): return { '_type': 'playlist', - 'id': 'user%s_likes' % user_id, + 'id': '%s_likes' % user_id, 'title': title, 'description': description, 'entries': pl, From 58197205d32ae7164303b8ac37ad1d1191a91a8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 May 2018 00:30:41 +0700 Subject: [PATCH 146/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/ChangeLog b/ChangeLog index ef6cc3850..37dba892e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,34 @@ +version <unreleased> + +Extractors +* [vimeo:likes] Relax URL regular expression and fix single page likes + extraction (#16475) +* [pluralsight] Fix clip id extraction (#16460) ++ [mychannels] Add support for mychannels.com (#15334) +- [moniker] Remove extractor (#15336) +* [pbs] Fix embed data extraction (#16474) ++ [mtv] Add support for paramountnetwork.com and bellator.com (#15418) +* [youtube] Fix hd720 format position +* [dailymotion] Remove fragment part from m3u8 URLs (#8915) +* [3sat] Improve extraction (#15350) + * Extract all formats + * Extract more format metadata + * Improve format sorting + * Use hls native downloader + * Detect and bypass geo-restriction ++ [dtube] Add support for d.tube (#15201) +* [options] Fix typo (#16450) +* [youtube] Improve format filesize extraction (#16453) +* [youtube] Make uploader extraction non fatal (#16444) +* [youtube] Fix extraction for embed restricted live streams (#16433) +* [nbc] Improve info extraction (#16440) +* [twitch:clips] Fix extraction (#16429) +* [redditr] Relax URL regular expression (#16426, #16427) +* [mixcloud] Bypass throttling for HTTP formats (#12579, #16424) ++ [nick] Add support for nickjr.de (#13230) +* [teamcoco] Fix extraction (#16374) + + version 2018.05.09 Core From 7550ea501a94ed9060220cf4c8f696e514862c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 May 2018 00:32:51 +0700 Subject: [PATCH 147/488] release 2018.05.18 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 2 +- docs/supportedsites.md | 7 ++++--- youtube_dl/version.py | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index b2bfa9ec5..7d9de5171 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.09** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.09 +[debug] youtube-dl version 2018.05.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 37dba892e..08233cd5b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.05.18 Extractors * [vimeo:likes] Relax URL regular expression and fix single page likes diff --git a/README.md b/README.md index d9fe2350a..20982b0f1 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --geo-verification-proxy URL Use this proxy to verify the IP address for some geo-restricted sites. The default proxy specified by --proxy (or none, if the - options is not present) is used for the + option is not present) is used for the actual downloading. --geo-bypass Bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 88fac6e90..c1048cc4c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -100,6 +100,7 @@ - **Beatport** - **Beeg** - **BehindKink** + - **Bellator** - **BellMedia** - **Bet** - **Bigflix** @@ -234,6 +235,7 @@ - **DrTuber** - **drtv** - **drtv:live** + - **DTube** - **Dumpert** - **dvtv**: http://video.aktualne.cz/ - **dw** @@ -448,7 +450,6 @@ - **mailru**: Видео@Mail.Ru - **mailru:music**: Музыка@Mail.Ru - **mailru:music:search**: Музыка@Mail.Ru - - **MakersChannel** - **MakerTV** - **mangomolo:live** - **mangomolo:video** @@ -486,7 +487,6 @@ - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** - **Mojvideo** - - **Moniker**: allmyvideos.net and vidspot.net - **Morningstar**: morningstar.com - **Motherless** - **MotherlessGroup** @@ -508,6 +508,7 @@ - **mva:course**: Microsoft Virtual Academy courses - **Mwave** - **MwaveMeetGreet** + - **MyChannels** - **MySpace** - **MySpace:album** - **MySpass** @@ -618,6 +619,7 @@ - **PacktPubCourse** - **PandaTV**: 熊猫TV - **pandora.tv**: 판도라TV + - **ParamountNetwork** - **parliamentlive.tv**: UK parliament videos - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) @@ -789,7 +791,6 @@ - **Spiegel** - **Spiegel:Article**: Articles on spiegel.de - **Spiegeltv** - - **Spike** - **Sport5** - **SportBoxEmbed** - **SportDeutschland** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6f47b1795..a43eec860 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.09' +__version__ = '2018.05.18' From 0167f0dbfe792355e793ea82791d61fc1d05f1f9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 19 May 2018 10:15:11 +0100 Subject: [PATCH 148/488] [imdb] improve extraction(fixes #4085)(fixes #14557) --- youtube_dl/extractor/imdb.py | 107 ++++++++++++++++------------------- 1 file changed, 48 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 425421968..926c2c388 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -7,23 +7,23 @@ from ..compat import compat_str from ..utils import ( determine_ext, mimetype2ext, + parse_duration, qualities, - remove_end, ) class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).+?[/-]vi(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', 'info_dict': { 'id': '2524815897', 'ext': 'mp4', - 'title': 'Ice Age: Continental Drift Trailer (No. 2)', - 'description': 'md5:9061c2219254e5d14e03c25c98e96a81', + 'title': 'No. 2 from Ice Age: Continental Drift (2012)', + 'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7', } }, { 'url': 'http://www.imdb.com/video/_/vi2524815897', @@ -40,82 +40,67 @@ class ImdbIE(InfoExtractor): }, { 'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561', 'only_matching': True, + }, { + 'url': 'https://www.imdb.com/list/ls009921623/videoplayer/vi260482329', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id) - descr = self._html_search_regex( - r'(?s)<span itemprop="description">(.*?)</span>', - webpage, 'description', fatal=False) - player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id - player_page = self._download_webpage( - player_url, video_id, 'Downloading player page') - # the player page contains the info for the default format, we have to - # fetch other pages for the rest of the formats - extra_formats = re.findall(r'href="(?P<url>%s.*?)".*?>(?P<name>.*?)<' % re.escape(player_url), player_page) - format_pages = [ - self._download_webpage( - f_url, video_id, 'Downloading info for %s format' % f_name) - for f_url, f_name in extra_formats] - format_pages.append(player_page) + webpage = self._download_webpage( + 'https://www.imdb.com/videoplayer/vi' + video_id, video_id) + video_metadata = self._parse_json(self._search_regex( + r'window\.IMDbReactInitialState\.push\(({.+?})\);', webpage, + 'video metadata'), video_id)['videos']['videoMetadata']['vi' + video_id] + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage) or self._html_search_regex( + r'<title>(.+?)', webpage, 'title', fatal=False) or video_metadata['title'] quality = qualities(('SD', '480p', '720p', '1080p')) formats = [] - for format_page in format_pages: - json_data = self._search_regex( - r']+class="imdb-player-data"[^>]*?>(.*?)', - format_page, 'json data', flags=re.DOTALL) - info = self._parse_json(json_data, video_id, fatal=False) - if not info: + for encoding in video_metadata.get('encodings', []): + if not encoding or not isinstance(encoding, dict): continue - format_info = info.get('videoPlayerObject', {}).get('video', {}) - if not format_info: + video_url = encoding.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): continue - video_info_list = format_info.get('videoInfoList') - if not video_info_list or not isinstance(video_info_list, list): + ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType'))) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) continue - for video_info in video_info_list: - if not video_info or not isinstance(video_info, dict): - continue - video_url = video_info.get('videoUrl') - if not video_url or not isinstance(video_url, compat_str): - continue - if (video_info.get('videoMimeType') == 'application/x-mpegURL' or - determine_ext(video_url) == 'm3u8'): - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - format_id = format_info.get('ffname') - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': mimetype2ext(video_info.get('videoMimeType')), - 'quality': quality(format_id), - }) + format_id = encoding.get('definition') + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + 'quality': quality(format_id), + }) self._sort_formats(formats) return { 'id': video_id, - 'title': remove_end(self._og_search_title(webpage), ' - IMDb'), + 'title': title, 'formats': formats, - 'description': descr, - 'thumbnail': format_info.get('slate'), + 'description': video_metadata.get('description'), + 'thumbnail': video_metadata.get('slate', {}).get('url'), + 'duration': parse_duration(video_metadata.get('duration')), } class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/(?P[\da-zA-Z_-]{11})' + _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P\d+)(?!/videoplayer/vi\d+)' _TEST = { - 'url': 'http://www.imdb.com/list/JFs9NWw6XI0', + 'url': 'https://www.imdb.com/list/ls009921623/', 'info_dict': { - 'id': 'JFs9NWw6XI0', - 'title': 'March 23, 2012 Releases', + 'id': '009921623', + 'title': 'The Bourne Legacy', + 'description': 'A list of trailers, clips, and more from The Bourne Legacy, starring Jeremy Renner and Rachel Weisz.', }, - 'playlist_count': 7, + 'playlist_count': 8, } def _real_extract(self, url): @@ -123,9 +108,13 @@ class ImdbListIE(InfoExtractor): webpage = self._download_webpage(url, list_id) entries = [ self.url_result('http://www.imdb.com' + m, 'Imdb') - for m in re.findall(r'href="(/video/imdb/vi[^"]+)"\s+data-type="playlist"', webpage)] + for m in re.findall(r'href="(/list/ls%s/videoplayer/vi[^"]+)"' % list_id, webpage)] list_title = self._html_search_regex( - r'

(.*?)

', webpage, 'list title') + r']+class="[^"]*header[^"]*"[^>]*>(.*?)', + webpage, 'list title') + list_description = self._html_search_regex( + r']+class="[^"]*list-description[^"]*"[^>]*>

(.*?)

', + webpage, 'list description') - return self.playlist_result(entries, list_id, list_title) + return self.playlist_result(entries, list_id, list_title, list_description) From 27694fe7ad77d5f99d7b46fa7395f4ccbb378777 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 19 May 2018 11:04:08 +0100 Subject: [PATCH 149/488] [imdb:list] fix _VALID_URL regex --- youtube_dl/extractor/imdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 926c2c388..4bafa54a2 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -92,7 +92,7 @@ class ImdbIE(InfoExtractor): class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P\d+)(?!/videoplayer/vi\d+)' + _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P\d{9})(?!/videoplayer/vi\d+)' _TEST = { 'url': 'https://www.imdb.com/list/ls009921623/', 'info_dict': { From acd620c930a92511c2e2099a4fc82d41825fdf93 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 19 May 2018 12:19:05 +0100 Subject: [PATCH 150/488] [teamcoco] improve _VALID_URL regex(#16484) --- youtube_dl/extractor/teamcoco.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index f06e5b19a..64235b0f6 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -16,7 +16,7 @@ from ..utils import ( class TeamcocoIE(InfoExtractor): - _VALID_URL = r'https?://teamcoco\.com/video/(?P[^/?#]+)' + _VALID_URL = r'https?://teamcoco\.com/video/(?P([^/]+/)*[^/?#]+)' _TESTS = [ { 'url': 'http://teamcoco.com/video/mary-kay-remote', @@ -67,6 +67,9 @@ class TeamcocoIE(InfoExtractor): 'skip_download': True, # m3u8 downloads }, 'skip': 'This video is no longer available.', + }, { + 'url': 'http://teamcoco.com/video/the-conan-audiencey-awards-for-04/25/18', + 'only_matching': True, } ] From f2b1fa07ec063ca63373e8558223e7af544f2cf8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 19 May 2018 13:05:51 +0100 Subject: [PATCH 151/488] [teamcoco] relax _VALID_URL regex and add a fallback for format extraction(fixes #16484) --- youtube_dl/extractor/teamcoco.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 64235b0f6..63fd4fe1c 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -16,7 +16,7 @@ from ..utils import ( class TeamcocoIE(InfoExtractor): - _VALID_URL = r'https?://teamcoco\.com/video/(?P([^/]+/)*[^/?#]+)' + _VALID_URL = r'https?://teamcoco\.com/(?P([^/]+/)*[^/?#]+)' _TESTS = [ { 'url': 'http://teamcoco.com/video/mary-kay-remote', @@ -70,6 +70,15 @@ class TeamcocoIE(InfoExtractor): }, { 'url': 'http://teamcoco.com/video/the-conan-audiencey-awards-for-04/25/18', 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/italy/conan-jordan-schlansky-hit-the-streets-of-florence', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/haiti/conan-s-haitian-history-lesson', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/israel/conan-hits-the-streets-beaches-of-tel-aviv', + 'only_matching': True, } ] @@ -84,7 +93,7 @@ class TeamcocoIE(InfoExtractor): display_id = self._match_id(url) response = self._graphql_call('''{ - %s(slug: "video/%s") { + %s(slug: "%s") { ... on RecordSlug { record { id @@ -94,6 +103,9 @@ class TeamcocoIE(InfoExtractor): thumb { preview } + file { + url + } tags { name } @@ -111,15 +123,15 @@ class TeamcocoIE(InfoExtractor): record = response['record'] video_id = record['id'] - srcs = self._graphql_call('''{ + video_sources = self._graphql_call('''{ %s(id: "%s") { src } -}''', 'RecordVideoSource', video_id)['src'] +}''', 'RecordVideoSource', video_id) or {} formats = [] get_quality = qualities(['low', 'sd', 'hd', 'uhd']) - for format_id, src in srcs.items(): + for format_id, src in video_sources.get('src', {}).items(): if not isinstance(src, dict): continue src_url = src.get('src') @@ -146,6 +158,9 @@ class TeamcocoIE(InfoExtractor): 'format_id': format_id, 'quality': get_quality(format_id), }) + if not formats: + formats = self._extract_m3u8_formats( + record['file']['url'], video_id, 'mp4', fatal=False) self._sort_formats(formats) return { From 504f20dd302189db2cfe1cb5ee9a622c39ee693c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 19 May 2018 23:53:24 +0700 Subject: [PATCH 152/488] Remove experimental mark for some options --- youtube_dl/YoutubeDL.py | 10 +++++----- youtube_dl/downloader/common.py | 1 - youtube_dl/extractor/common.py | 9 +++------ youtube_dl/options.py | 12 ++++++------ 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 046e03247..2a405c5ca 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -211,7 +211,7 @@ class YoutubeDL(object): At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use geo_verification_proxy: URL of the proxy to use for IP address verification - on geo-restricted sites. (Experimental) + on geo-restricted sites. socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi @@ -259,7 +259,7 @@ class YoutubeDL(object): - "warn": only emit a warning - "detect_or_warn": check whether we can do anything about it, warn otherwise (default) - source_address: (Experimental) Client-side IP address to bind to. + source_address: Client-side IP address to bind to. call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. sleep_interval: Number of seconds to sleep before each download when @@ -281,14 +281,14 @@ class YoutubeDL(object): match_filter_func in utils.py is one example for this. no_color: Do not emit color codes in output. geo_bypass: Bypass geographic restriction via faking X-Forwarded-For - HTTP header (experimental) + HTTP header geo_bypass_country: Two-letter ISO 3166-2 country code that will be used for explicit geographic restriction bypassing via faking - X-Forwarded-For HTTP header (experimental) + X-Forwarded-For HTTP header geo_bypass_ip_block: IP range in CIDR notation that will be used similarly to - geo_bypass_country (experimental) + geo_bypass_country The following options determine which downloader is picked: external_downloader: Executable of the external downloader to call. diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index edd125ee2..5979833c0 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -45,7 +45,6 @@ class FileDownloader(object): min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. - (experimental) external_downloader_args: A list of additional command-line arguments for the external downloader. hls_use_mpegts: Use the mpegts container for HLS videos. diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3ef5af13c..a2548dba3 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -339,20 +339,17 @@ class InfoExtractor(object): _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. Though it won't disable explicit geo restriction bypass based on - country code provided with geo_bypass_country. (experimental) + country code provided with geo_bypass_country. _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted countries for this extractor. One of these countries will be used by geo restriction bypass mechanism right away in order to bypass - geo restriction, of course, if the mechanism is not disabled. (experimental) + geo restriction, of course, if the mechanism is not disabled. _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted IP blocks in CIDR notation for this extractor. One of these IP blocks will be used by geo restriction bypass mechanism similarly - to _GEO_COUNTRIES. (experimental) - - NB: both these geo attributes are experimental and may change in future - or be completely removed. + to _GEO_COUNTRIES. Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. diff --git a/youtube_dl/options.py b/youtube_dl/options.py index b692c6b3b..e83d546a0 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -203,7 +203,7 @@ def parseOpts(overrideArguments=None): network.add_option( '--proxy', dest='proxy', default=None, metavar='URL', - help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable experimental ' + help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable ' 'SOCKS proxy, specify a proper scheme. For example ' 'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") ' 'for direct connection') @@ -240,19 +240,19 @@ def parseOpts(overrideArguments=None): geo.add_option( '--geo-bypass', action='store_true', dest='geo_bypass', default=True, - help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)') + help='Bypass geographic restriction via faking X-Forwarded-For HTTP header') geo.add_option( '--no-geo-bypass', action='store_false', dest='geo_bypass', default=True, - help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)') + help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header') geo.add_option( '--geo-bypass-country', metavar='CODE', dest='geo_bypass_country', default=None, - help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)') + help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code') geo.add_option( '--geo-bypass-ip-block', metavar='IP_BLOCK', dest='geo_bypass_ip_block', default=None, - help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation (experimental)') + help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation') selection = optparse.OptionGroup(parser, 'Video Selection') selection.add_option( @@ -502,7 +502,7 @@ def parseOpts(overrideArguments=None): downloader.add_option( '--xattr-set-filesize', dest='xattr_set_filesize', action='store_true', - help='Set file xattribute ytdl.filesize with expected file size (experimental)') + help='Set file xattribute ytdl.filesize with expected file size') downloader.add_option( '--hls-prefer-native', dest='hls_prefer_native', action='store_true', default=None, From 5c766952dc6de9065060344342184c4037403409 Mon Sep 17 00:00:00 2001 From: huichen90 <35417991+huichen90@users.noreply.github.com> Date: Wed, 16 May 2018 17:29:25 +0800 Subject: [PATCH 153/488] Update leeco.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed this bug :youtube_dl.utils.ExtractorError: An extractor error has occurred. (caused by KeyError('location',)); --- youtube_dl/extractor/leeco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index ffe10154b..8dd1ce0d0 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -130,7 +130,7 @@ class LeIE(InfoExtractor): media_id, 'Downloading flash playJson data', query={ 'id': media_id, 'platid': 1, - 'splatid': 101, + 'splatid': 105, 'format': 1, 'source': 1000, 'tkey': self.calc_time_key(int(time.time())), From db2058f63e64ff59ffad0e1e8ad5e18d18d3da71 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 21 May 2018 14:53:02 +0100 Subject: [PATCH 154/488] [globo] improve extraction(closes #4189) - add support for authentication - simplify url signing - extract DASH and MSS formats --- youtube_dl/extractor/globo.py | 321 +++++----------------------------- 1 file changed, 41 insertions(+), 280 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index dc7b2661c..730deda6b 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -1,16 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 +import hashlib +import json import random import re -import math from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_chr, - compat_ord, -) +from ..compat import compat_str from ..utils import ( ExtractorError, float_or_none, @@ -22,12 +20,7 @@ from ..utils import ( class GloboIE(InfoExtractor): _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P\d{7,})' - - _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' - _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s' - - _RESIGN_EXPIRATION = 86400 - + _LOGGED_IN = False _TESTS = [{ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', @@ -70,287 +63,49 @@ class GloboIE(InfoExtractor): 'only_matching': True, }] - class MD5(object): - HEX_FORMAT_LOWERCASE = 0 - HEX_FORMAT_UPPERCASE = 1 - BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = '' - BASE64_PAD_CHARACTER_RFC_COMPLIANCE = '=' - PADDING = '=0xFF01DD' - hexcase = 0 - b64pad = '' + def _real_initialize(self): + if self._LOGGED_IN: + return - def __init__(self): - pass + email, password = self._get_login_info() + if email is None: + return - class JSArray(list): - def __getitem__(self, y): - try: - return list.__getitem__(self, y) - except IndexError: - return 0 - - def __setitem__(self, i, y): - try: - return list.__setitem__(self, i, y) - except IndexError: - self.extend([0] * (i - len(self) + 1)) - self[-1] = y - - @classmethod - def hex_md5(cls, param1): - return cls.rstr2hex(cls.rstr_md5(cls.str2rstr_utf8(param1))) - - @classmethod - def b64_md5(cls, param1, param2=None): - return cls.rstr2b64(cls.rstr_md5(cls.str2rstr_utf8(param1, param2))) - - @classmethod - def any_md5(cls, param1, param2): - return cls.rstr2any(cls.rstr_md5(cls.str2rstr_utf8(param1)), param2) - - @classmethod - def rstr_md5(cls, param1): - return cls.binl2rstr(cls.binl_md5(cls.rstr2binl(param1), len(param1) * 8)) - - @classmethod - def rstr2hex(cls, param1): - _loc_2 = '0123456789ABCDEF' if cls.hexcase else '0123456789abcdef' - _loc_3 = '' - for _loc_5 in range(0, len(param1)): - _loc_4 = compat_ord(param1[_loc_5]) - _loc_3 += _loc_2[_loc_4 >> 4 & 15] + _loc_2[_loc_4 & 15] - return _loc_3 - - @classmethod - def rstr2b64(cls, param1): - _loc_2 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' - _loc_3 = '' - _loc_4 = len(param1) - for _loc_5 in range(0, _loc_4, 3): - _loc_6_1 = compat_ord(param1[_loc_5]) << 16 - _loc_6_2 = compat_ord(param1[_loc_5 + 1]) << 8 if _loc_5 + 1 < _loc_4 else 0 - _loc_6_3 = compat_ord(param1[_loc_5 + 2]) if _loc_5 + 2 < _loc_4 else 0 - _loc_6 = _loc_6_1 | _loc_6_2 | _loc_6_3 - for _loc_7 in range(0, 4): - if _loc_5 * 8 + _loc_7 * 6 > len(param1) * 8: - _loc_3 += cls.b64pad - else: - _loc_3 += _loc_2[_loc_6 >> 6 * (3 - _loc_7) & 63] - return _loc_3 - - @staticmethod - def rstr2any(param1, param2): - _loc_3 = len(param2) - _loc_4 = [] - _loc_9 = [0] * ((len(param1) >> 2) + 1) - for _loc_5 in range(0, len(_loc_9)): - _loc_9[_loc_5] = compat_ord(param1[_loc_5 * 2]) << 8 | compat_ord(param1[_loc_5 * 2 + 1]) - - while len(_loc_9) > 0: - _loc_8 = [] - _loc_7 = 0 - for _loc_5 in range(0, len(_loc_9)): - _loc_7 = (_loc_7 << 16) + _loc_9[_loc_5] - _loc_6 = math.floor(_loc_7 / _loc_3) - _loc_7 -= _loc_6 * _loc_3 - if len(_loc_8) > 0 or _loc_6 > 0: - _loc_8[len(_loc_8)] = _loc_6 - - _loc_4[len(_loc_4)] = _loc_7 - _loc_9 = _loc_8 - - _loc_10 = '' - _loc_5 = len(_loc_4) - 1 - while _loc_5 >= 0: - _loc_10 += param2[_loc_4[_loc_5]] - _loc_5 -= 1 - - return _loc_10 - - @classmethod - def str2rstr_utf8(cls, param1, param2=None): - _loc_3 = '' - _loc_4 = -1 - if not param2: - param2 = cls.PADDING - param1 = param1 + param2[1:9] - while True: - _loc_4 += 1 - if _loc_4 >= len(param1): - break - _loc_5 = compat_ord(param1[_loc_4]) - _loc_6 = compat_ord(param1[_loc_4 + 1]) if _loc_4 + 1 < len(param1) else 0 - if 55296 <= _loc_5 <= 56319 and 56320 <= _loc_6 <= 57343: - _loc_5 = 65536 + ((_loc_5 & 1023) << 10) + (_loc_6 & 1023) - _loc_4 += 1 - if _loc_5 <= 127: - _loc_3 += compat_chr(_loc_5) - continue - if _loc_5 <= 2047: - _loc_3 += compat_chr(192 | _loc_5 >> 6 & 31) + compat_chr(128 | _loc_5 & 63) - continue - if _loc_5 <= 65535: - _loc_3 += compat_chr(224 | _loc_5 >> 12 & 15) + compat_chr(128 | _loc_5 >> 6 & 63) + compat_chr( - 128 | _loc_5 & 63) - continue - if _loc_5 <= 2097151: - _loc_3 += compat_chr(240 | _loc_5 >> 18 & 7) + compat_chr(128 | _loc_5 >> 12 & 63) + compat_chr( - 128 | _loc_5 >> 6 & 63) + compat_chr(128 | _loc_5 & 63) - return _loc_3 - - @staticmethod - def rstr2binl(param1): - _loc_2 = [0] * ((len(param1) >> 2) + 1) - for _loc_3 in range(0, len(_loc_2)): - _loc_2[_loc_3] = 0 - for _loc_3 in range(0, len(param1) * 8, 8): - _loc_2[_loc_3 >> 5] |= (compat_ord(param1[_loc_3 // 8]) & 255) << _loc_3 % 32 - return _loc_2 - - @staticmethod - def binl2rstr(param1): - _loc_2 = '' - for _loc_3 in range(0, len(param1) * 32, 8): - _loc_2 += compat_chr(param1[_loc_3 >> 5] >> _loc_3 % 32 & 255) - return _loc_2 - - @classmethod - def binl_md5(cls, param1, param2): - param1 = cls.JSArray(param1) - param1[param2 >> 5] |= 128 << param2 % 32 - param1[(param2 + 64 >> 9 << 4) + 14] = param2 - _loc_3 = 1732584193 - _loc_4 = -271733879 - _loc_5 = -1732584194 - _loc_6 = 271733878 - for _loc_7 in range(0, len(param1), 16): - _loc_8 = _loc_3 - _loc_9 = _loc_4 - _loc_10 = _loc_5 - _loc_11 = _loc_6 - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 7, -680876936) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 1], 12, -389564586) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 17, 606105819) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 3], 22, -1044525330) - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 7, -176418897) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 5], 12, 1200080426) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 17, -1473231341) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 7], 22, -45705983) - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 7, 1770035416) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 9], 12, -1958414417) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 17, -42063) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 11], 22, -1990404162) - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 7, 1804603682) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 13], 12, -40341101) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 17, -1502002290) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 15], 22, 1236535329) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 5, -165796510) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 6], 9, -1069501632) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 14, 643717713) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 0], 20, -373897302) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 5, -701558691) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 10], 9, 38016083) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 14, -660478335) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 4], 20, -405537848) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 5, 568446438) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 14], 9, -1019803690) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 14, -187363961) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 8], 20, 1163531501) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 5, -1444681467) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 2], 9, -51403784) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 14, 1735328473) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 12], 20, -1926607734) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 4, -378558) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 8], 11, -2022574463) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 16, 1839030562) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 14], 23, -35309556) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 4, -1530992060) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 4], 11, 1272893353) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 16, -155497632) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 10], 23, -1094730640) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 4, 681279174) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 0], 11, -358537222) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 16, -722521979) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 6], 23, 76029189) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 4, -640364487) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 12], 11, -421815835) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 16, 530742520) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 2], 23, -995338651) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 6, -198630844) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 7], 10, 1126891415) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 15, -1416354905) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 5], 21, -57434055) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 6, 1700485571) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 3], 10, -1894986606) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 15, -1051523) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 1], 21, -2054922799) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 6, 1873313359) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 15], 10, -30611744) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 15, -1560198380) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 13], 21, 1309151649) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 6, -145523070) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 11], 10, -1120210379) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 15, 718787259) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 9], 21, -343485551) - _loc_3 = cls.safe_add(_loc_3, _loc_8) - _loc_4 = cls.safe_add(_loc_4, _loc_9) - _loc_5 = cls.safe_add(_loc_5, _loc_10) - _loc_6 = cls.safe_add(_loc_6, _loc_11) - return [_loc_3, _loc_4, _loc_5, _loc_6] - - @classmethod - def md5_cmn(cls, param1, param2, param3, param4, param5, param6): - return cls.safe_add( - cls.bit_rol(cls.safe_add(cls.safe_add(param2, param1), cls.safe_add(param4, param6)), param5), param3) - - @classmethod - def md5_ff(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param2 & param3 | ~param2 & param4, param1, param2, param5, param6, param7) - - @classmethod - def md5_gg(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param2 & param4 | param3 & ~param4, param1, param2, param5, param6, param7) - - @classmethod - def md5_hh(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param2 ^ param3 ^ param4, param1, param2, param5, param6, param7) - - @classmethod - def md5_ii(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param3 ^ (param2 | ~param4), param1, param2, param5, param6, param7) - - @classmethod - def safe_add(cls, param1, param2): - _loc_3 = (param1 & 65535) + (param2 & 65535) - _loc_4 = (param1 >> 16) + (param2 >> 16) + (_loc_3 >> 16) - return cls.lshift(_loc_4, 16) | _loc_3 & 65535 - - @classmethod - def bit_rol(cls, param1, param2): - return cls.lshift(param1, param2) | (param1 & 0xFFFFFFFF) >> (32 - param2) - - @staticmethod - def lshift(value, count): - r = (0xFFFFFFFF & value) << count - return -(~(r - 1) & 0xFFFFFFFF) if r > 0x7FFFFFFF else r + self._download_json( + 'https://login.globo.com/api/authentication', None, data=json.dumps({ + 'payload': { + 'email': email, + 'password': password, + 'serviceId': 4654, + }, + }).encode(), headers={ + 'Content-Type': 'application/json; charset=utf-8', + }) + self._LOGGED_IN = True def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( - self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] + 'http://api.globovideos.com/videos/%s/playlist' % video_id, + video_id)['videos'][0] title = video['title'] formats = [] for resource in video['resources']: resource_id = resource.get('_id') - if not resource_id or resource_id.endswith('manifest'): + resource_url = resource.get('url') + if not resource_id or not resource_url: continue security = self._download_json( - self._SECURITY_URL_TEMPLATE % (video_id, resource_id), - video_id, 'Downloading security hash for %s' % resource_id) + 'http://security.video.globo.com/videos/%s/hash' % video_id, + video_id, 'Downloading security hash for %s' % resource_id, query={ + 'player': 'flash', + 'version': '17.0.0.132', + 'resource_id': resource_id, + }) security_hash = security.get('hash') if not security_hash: @@ -365,18 +120,24 @@ class GloboIE(InfoExtractor): received_random = security_hash[12:22] received_md5 = security_hash[22:] - sign_time = received_time + self._RESIGN_EXPIRATION + sign_time = received_time + 86400 padding = '%010d' % random.randint(1, 10000000000) - signed_md5 = self.MD5.b64_md5(received_md5 + compat_str(sign_time) + padding) + md5_data = (received_md5 + str(sign_time) + padding + '0xFF01DD').encode() + signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5 - resource_url = resource['url'] signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats( signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif resource_id.endswith('mpd') or resource_url.endswith('.mpd'): + formats.extend(self._extract_mpd_formats( + signed_url, resource_id, mpd_id='dash', fatal=False)) + elif resource_id.endswith('manifest') or resource_url.endswith('/manifest'): + formats.extend(self._extract_ism_formats( + signed_url, resource_id, ism_id='mss', fatal=False)) else: formats.append({ 'url': signed_url, From e5187493002f1d089d450fc3b2b4af64c996dc71 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 21 May 2018 15:07:24 +0100 Subject: [PATCH 155/488] [globo] handle login errors --- youtube_dl/extractor/globo.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 730deda6b..9c2360464 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -8,7 +8,10 @@ import random import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( ExtractorError, float_or_none, @@ -71,16 +74,22 @@ class GloboIE(InfoExtractor): if email is None: return - self._download_json( - 'https://login.globo.com/api/authentication', None, data=json.dumps({ - 'payload': { - 'email': email, - 'password': password, - 'serviceId': 4654, - }, - }).encode(), headers={ - 'Content-Type': 'application/json; charset=utf-8', - }) + try: + self._download_json( + 'https://login.globo.com/api/authentication', None, data=json.dumps({ + 'payload': { + 'email': email, + 'password': password, + 'serviceId': 4654, + }, + }).encode(), headers={ + 'Content-Type': 'application/json; charset=utf-8', + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json(e.cause.read(), None) + raise ExtractorError(resp.get('userMessage') or resp['id'], expected=True) + raise self._LOGGED_IN = True def _real_extract(self, url): From d81ffc3aa0f7b4114cec68cac9e347689a6d5462 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 21 May 2018 15:39:02 +0100 Subject: [PATCH 156/488] [globo] Add entry for netrc authentication --- youtube_dl/extractor/globo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 9c2360464..8e6c38742 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -24,6 +24,7 @@ from ..utils import ( class GloboIE(InfoExtractor): _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P\d{7,})' _LOGGED_IN = False + _NETRC_MACHINE = 'globo' _TESTS = [{ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', From b89ac534555692c3c29a57d97ec0bda3bef3b086 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 21 May 2018 17:46:52 +0100 Subject: [PATCH 157/488] [globo] use compat_str --- youtube_dl/extractor/globo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 8e6c38742..81d6d36d3 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -126,16 +126,16 @@ class GloboIE(InfoExtractor): continue hash_code = security_hash[:2] - received_time = int(security_hash[2:12]) + received_time = security_hash[2:12] received_random = security_hash[12:22] received_md5 = security_hash[22:] - sign_time = received_time + 86400 + sign_time = compat_str(int(received_time) + 86400) padding = '%010d' % random.randint(1, 10000000000) - md5_data = (received_md5 + str(sign_time) + padding + '0xFF01DD').encode() + md5_data = (received_md5 + sign_time + padding + '0xFF01DD').encode() signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') - signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5 + signed_hash = hash_code + received_time + received_random + sign_time + padding + signed_md5 signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): From 57d6792024f2670a21f923dfbd81614a1ee6b735 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 11:27:36 +0100 Subject: [PATCH 158/488] [viewlift] fix extraction for snagfils.com(closes #15766) --- youtube_dl/extractor/viewlift.py | 164 ++++++++++++++++++++++--------- 1 file changed, 115 insertions(+), 49 deletions(-) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 1f29c273f..e466156f6 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -1,24 +1,27 @@ from __future__ import unicode_literals +import base64 import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, clean_html, determine_ext, int_or_none, js_to_json, + parse_age_limit, parse_duration, ) class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|monumentalsportsnetwork|vayafilm)\.com|kesari\.tv' + _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com' class ViewLiftEmbedIE(ViewLiftBaseIE): - _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f-]{36})' % ViewLiftBaseIE._DOMAINS_REGEX + _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -60,8 +63,10 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): formats = [] has_bitrate = False - for source in self._parse_json(js_to_json(self._search_regex( - r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): + sources = self._parse_json(self._search_regex( + r'(?s)sources:\s*(\[.+?\]),', webpage, + 'sources', default='[]'), video_id, js_to_json) + for source in sources: file_ = source.get('file') if not file_: continue @@ -70,7 +75,8 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): format_id = source.get('label') or ext if all(v in ('m3u8', 'hls') for v in (type_, ext)): formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', m3u8_id='hls')) + file_, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: bitrate = int_or_none(self._search_regex( [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], @@ -85,6 +91,13 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'tbr': bitrate, 'height': height, }) + if not formats: + hls_url = self._parse_json(self._search_regex( + r'filmInfo\.src\s*=\s*({.+?});', + webpage, 'src'), video_id, js_to_json)['src'] + formats = self._extract_m3u8_formats( + hls_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') self._sort_formats(formats, field_preference) @@ -109,10 +122,13 @@ class ViewLiftIE(ViewLiftBaseIE): 'display_id': 'lost_for_life', 'ext': 'mp4', 'title': 'Lost for Life', - 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82', + 'description': 'md5:ea10b5a50405ae1f7b5269a6ec594102', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 4489, - 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals'] + 'categories': 'mincount:3', + 'age_limit': 14, + 'upload_date': '20150421', + 'timestamp': 1429656819, } }, { 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', @@ -125,7 +141,9 @@ class ViewLiftIE(ViewLiftBaseIE): 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 979, - 'categories': ['Documentary', 'Sports', 'Politics'] + 'categories': 'mincount:2', + 'timestamp': 1399478279, + 'upload_date': '20140507', } }, { # Film is not playable in your area. @@ -138,9 +156,6 @@ class ViewLiftIE(ViewLiftBaseIE): }, { 'url': 'http://www.winnersview.com/videos/the-good-son', 'only_matching': True, - }, { - 'url': 'http://www.kesari.tv/news/video/1461919076414', - 'only_matching': True, }, { # Was once Kaltura embed 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', @@ -156,45 +171,96 @@ class ViewLiftIE(ViewLiftBaseIE): raise ExtractorError( 'Film %s is not available.' % display_id, expected=True) - film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') + initial_store_state = self._search_regex( + r"window\.initialStoreState\s*=.*?JSON\.parse\(unescape\(atob\('([^']+)'\)\)\)", + webpage, 'Initial Store State', default=None) + if initial_store_state: + modules = self._parse_json(compat_urllib_parse_unquote(base64.b64decode( + initial_store_state).decode()), display_id)['page']['data']['modules'] + content_data = next(m['contentData'][0] for m in modules if m.get('moduleType') == 'VideoDetailModule') + gist = content_data['gist'] + film_id = gist['id'] + title = gist['title'] + video_assets = content_data['streamingInfo']['videoAssets'] - snag = self._parse_json( - self._search_regex( - r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'), - display_id) + formats = [] + mpeg_video_assets = video_assets.get('mpeg') or [] + for video_asset in mpeg_video_assets: + video_asset_url = video_asset.get('url') + if not video_asset: + continue + bitrate = int_or_none(video_asset.get('bitrate')) + height = int_or_none(self._search_regex( + r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), + 'height', default=None)) + formats.append({ + 'url': video_asset_url, + 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), + 'tbr': bitrate, + 'height': height, + 'vcodec': video_asset.get('codec'), + }) - for item in snag: - if item.get('data', {}).get('film', {}).get('id') == film_id: - data = item['data']['film'] - title = data['title'] - description = clean_html(data.get('synopsis')) - thumbnail = data.get('image') - duration = int_or_none(data.get('duration') or data.get('runtime')) - categories = [ - category['title'] for category in data.get('categories', []) - if category.get('title')] - break + hls_url = video_assets.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'format_id')) + + info = { + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': gist.get('description'), + 'thumbnail': gist.get('videoImageUrl'), + 'duration': int_or_none(gist.get('runtime')), + 'age_limit': parse_age_limit(content_data.get('parentalRating', '').replace('_', '-')), + 'timestamp': int_or_none(gist.get('publishDate'), 1000), + 'formats': formats, + } + for k in ('categories', 'tags'): + info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] + return info else: - title = self._search_regex( - r'itemprop="title">([^<]+)<', webpage, 'title') - description = self._html_search_regex( - r'(?s)
(.+?)
', - webpage, 'description', default=None) or self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = parse_duration(self._search_regex( - r'([^<]+)<', - webpage, 'duration', fatal=False)) - categories = re.findall(r'([^<]+)', webpage) + film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') - return { - '_type': 'url_transparent', - 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'categories': categories, - 'ie_key': 'ViewLiftEmbed', - } + snag = self._parse_json( + self._search_regex( + r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag', default='[]'), + display_id) + + for item in snag: + if item.get('data', {}).get('film', {}).get('id') == film_id: + data = item['data']['film'] + title = data['title'] + description = clean_html(data.get('synopsis')) + thumbnail = data.get('image') + duration = int_or_none(data.get('duration') or data.get('runtime')) + categories = [ + category['title'] for category in data.get('categories', []) + if category.get('title')] + break + else: + title = self._search_regex( + r'itemprop="title">([^<]+)<', webpage, 'title') + description = self._html_search_regex( + r'(?s)
(.+?)
', + webpage, 'description', default=None) or self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'([^<]+)<', + webpage, 'duration', fatal=False)) + categories = re.findall(r'([^<]+)', webpage) + + return { + '_type': 'url_transparent', + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'categories': categories, + 'ie_key': 'ViewLiftEmbed', + } From b836118724122a639a1cb78d55d91724bf1e7251 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 12:12:20 +0100 Subject: [PATCH 159/488] [utils] Relax TV Parental Guidelines matching --- youtube_dl/utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f9ca63c58..d61af8837 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2253,12 +2253,12 @@ US_RATINGS = { TV_PARENTAL_GUIDELINES = { - 'TV-Y': 0, - 'TV-Y7': 7, - 'TV-G': 0, - 'TV-PG': 0, - 'TV-14': 14, - 'TV-MA': 17, + 'Y': 0, + 'Y7': 7, + 'G': 0, + 'PG': 0, + '14': 14, + 'MA': 17, } @@ -2272,7 +2272,10 @@ def parse_age_limit(s): return int(m.group('age')) if s in US_RATINGS: return US_RATINGS[s] - return TV_PARENTAL_GUIDELINES.get(s) + m = re.match(r'^TV[_-]?(%s)$' % '|'.join(TV_PARENTAL_GUIDELINES.keys()), s) + if m: + return TV_PARENTAL_GUIDELINES[m.group(1)] + return None def strip_jsonp(code): From 670dcba8c73ee69545513522676b2c480bc48662 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 12:13:44 +0100 Subject: [PATCH 160/488] [viewlift] Remove rating format transformation --- youtube_dl/extractor/viewlift.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index e466156f6..51a002b11 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -214,7 +214,7 @@ class ViewLiftIE(ViewLiftBaseIE): 'description': gist.get('description'), 'thumbnail': gist.get('videoImageUrl'), 'duration': int_or_none(gist.get('runtime')), - 'age_limit': parse_age_limit(content_data.get('parentalRating', '').replace('_', '-')), + 'age_limit': parse_age_limit(content_data.get('parentalRating')), 'timestamp': int_or_none(gist.get('publishDate'), 1000), 'formats': formats, } From 268e132dec96ea9e8a9a3cafb788baf39a498c7d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 12:15:21 +0100 Subject: [PATCH 161/488] [go90] extract age limit and detect drm protection(#10127) --- youtube_dl/extractor/go90.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 9b2e1c164..35dde42d0 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -6,7 +6,9 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + ExtractorError, int_or_none, + parse_age_limit, parse_iso8601, ) @@ -23,6 +25,7 @@ class Go90IE(InfoExtractor): 'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn stance. Then, VICE Sports explains NFL contracts.', 'timestamp': 1491868800, 'upload_date': '20170411', + 'age_limit': 14, } } @@ -33,6 +36,8 @@ class Go90IE(InfoExtractor): video_id, headers={ 'Content-Type': 'application/json; charset=utf-8', }, data=b'{"client":"web","device_type":"pc"}') + if video_data.get('requires_drm'): + raise ExtractorError('This video is DRM protected.', expected=True) main_video_asset = video_data['main_video_asset'] episode_number = int_or_none(video_data.get('episode_number')) @@ -123,4 +128,5 @@ class Go90IE(InfoExtractor): 'season_number': season_number, 'episode_number': episode_number, 'subtitles': subtitles, + 'age_limit': parse_age_limit(video_data.get('rating')), } From 3bb3ff38a15ccf00686c75af8d6635903632ee87 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 12:20:05 +0100 Subject: [PATCH 162/488] [test_utils] add tests for b836118724122a639a1cb78d55d91724bf1e7251 --- test/test_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 14503ab53..f2b51131c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -519,6 +519,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_age_limit('PG-13'), 13) self.assertEqual(parse_age_limit('TV-14'), 14) self.assertEqual(parse_age_limit('TV-MA'), 17) + self.assertEqual(parse_age_limit('TV14'), 14) + self.assertEqual(parse_age_limit('TV_G'), 0) def test_parse_duration(self): self.assertEqual(parse_duration(None), None) From ca0aef42d4fa77123c56c19ef3fe2673645391a2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 23:04:12 +0100 Subject: [PATCH 163/488] [viewlift] add support for hoichoi.tv(closes #16536) --- youtube_dl/extractor/viewlift.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 51a002b11..c43d1a1e8 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -17,7 +17,7 @@ from ..utils import ( class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com' + _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv' class ViewLiftEmbedIE(ViewLiftBaseIE): From 1139935db78b610d15ade2b667e2a07b4df0ecf0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 24 May 2018 02:51:47 +0100 Subject: [PATCH 164/488] [nbc] add support for stream.nbcsports.com(closes #13911) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nbc.py | 62 +++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7d5927131..52e330955 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -666,6 +666,7 @@ from .nbc import ( NBCOlympicsIE, NBCOlympicsStreamIE, NBCSportsIE, + NBCSportsStreamIE, NBCSportsVPlayerIE, ) from .ndr import ( diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 1b1722cfa..c843f8649 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -1,7 +1,8 @@ from __future__ import unicode_literals -import re import base64 +import json +import re from .common import InfoExtractor from .theplatform import ThePlatformIE @@ -175,6 +176,65 @@ class NBCSportsIE(InfoExtractor): NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') +class NBCSportsStreamIE(AdobePassIE): + _VALID_URL = r'https?://stream\.nbcsports\.com/.+?\bpid=(?P\d+)' + _TEST = { + 'url': 'http://stream.nbcsports.com/nbcsn/generic?pid=206559', + 'info_dict': { + 'id': '206559', + 'ext': 'mp4', + 'title': 'Amgen Tour of California Women\'s Recap', + 'description': 'md5:66520066b3b5281ada7698d0ea2aa894', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Requires Adobe Pass Authentication', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + live_source = self._download_json( + 'http://stream.nbcsports.com/data/live_sources_%s.json' % video_id, + video_id) + video_source = live_source['videoSources'][0] + title = video_source['title'] + source_url = None + for k in ('source', 'msl4source', 'iossource', 'hlsv4'): + sk = k + 'Url' + source_url = video_source.get(sk) or video_source.get(sk + 'Alt') + if source_url: + break + else: + source_url = video_source['ottStreamUrl'] + is_live = video_source.get('type') == 'live' or video_source.get('status') == 'Live' + resource = self._get_mvpd_resource('nbcsports', title, video_id, '') + token = self._extract_mvpd_auth(url, video_id, 'nbcsports', resource) + tokenized_url = self._download_json( + 'https://token.playmakerservices.com/cdn', + video_id, data=json.dumps({ + 'requestorId': 'nbcsports', + 'pid': video_id, + 'application': 'NBCSports', + 'version': 'v1', + 'platform': 'desktop', + 'cdn': 'akamai', + 'url': video_source['sourceUrl'], + 'token': base64.b64encode(token.encode()).decode(), + 'resourceId': base64.b64encode(resource.encode()).decode(), + }).encode())['tokenizedUrl'] + formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': live_source.get('description'), + 'formats': formats, + 'is_live': is_live, + } + + class CSNNEIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P[0-9a-z-]+)' From e8e58c22786918f93e6928d86b878fdc56461c4d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 24 May 2018 11:53:42 +0100 Subject: [PATCH 165/488] [hidive] add support for authentication(closes #16534) --- youtube_dl/extractor/hidive.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py index eee517071..d8f2e682f 100644 --- a/youtube_dl/extractor/hidive.py +++ b/youtube_dl/extractor/hidive.py @@ -17,6 +17,9 @@ class HiDiveIE(InfoExtractor): # Using X-Forwarded-For results in 403 HTTP error for HLS fragments, # so disabling geo bypass completely _GEO_BYPASS = False + _NETRC_MACHINE = 'hidive' + _LOGGED_IN = False + _LOGIN_URL = 'https://www.hidive.com/account/login' _TESTS = [{ 'url': 'https://www.hidive.com/stream/the-comic-artist-and-his-assistants/s01e001', @@ -31,8 +34,30 @@ class HiDiveIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Requires Authentication', }] + def _real_initialize(self): + if self._LOGGED_IN: + return + + (email, password) = self._get_login_info() + if email is None: + return + + webpage = self._download_webpage(self._LOGIN_URL, None) + form = self._search_regex( + r'(?s)]+action="/account/login"[^>]*>(.+?)', + webpage, 'login form') + data = self._hidden_inputs(form) + data.update({ + 'Email': email, + 'Password': password, + }) + self._download_webpage( + self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data)) + self._LOGGED_IN = True + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title, key = mobj.group('title', 'key') @@ -43,6 +68,7 @@ class HiDiveIE(InfoExtractor): data=urlencode_postdata({ 'Title': title, 'Key': key, + 'PlayerId': 'f4f895ce1ca713ba263b91caeb1daa2d08904783', })) restriction = settings.get('restrictionReason') @@ -79,6 +105,7 @@ class HiDiveIE(InfoExtractor): subtitles.setdefault(cc_lang, []).append({ 'url': cc_url, }) + self._sort_formats(formats) season_number = int_or_none(self._search_regex( r's(\d+)', key, 'season number', default=None)) From 3d2a643fdcba126b209b758f2e403742ee631cf3 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Thu, 24 May 2018 11:15:03 +0200 Subject: [PATCH 166/488] [imgur] Fix extraction --- youtube_dl/extractor/imgur.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 67c24a51c..2901960a5 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, @@ -21,7 +20,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The most awesome images on the Internet.', + 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/A61SaA1', @@ -29,7 +28,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The most awesome images on the Internet.', + 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/gallery/YcAQlkx', @@ -37,8 +36,6 @@ class ImgurIE(InfoExtractor): 'id': 'YcAQlkx', 'ext': 'mp4', 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', - 'description': 'Imgur: The most awesome images on the Internet.' - } }, { 'url': 'http://imgur.com/topic/Funny/N8rOudd', @@ -50,8 +47,8 @@ class ImgurIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - compat_urlparse.urljoin(url, video_id), video_id) + gifv_url = 'https://i.imgur.com/{id}.gifv'.format(id=video_id) + webpage = self._download_webpage(gifv_url, video_id) width = int_or_none(self._og_search_property( 'video:width', webpage, default=None)) @@ -107,7 +104,7 @@ class ImgurIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'description': self._og_search_description(webpage), + 'description': self._og_search_description(webpage, default=None), 'title': self._og_search_title(webpage), } From c561b75c82247188e010b6b53c118bb26b4daaf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 00:09:15 +0700 Subject: [PATCH 167/488] [peertube] Add extractor (closes #16301, closes #16329) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/peertube.py | 210 +++++++++++++++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100644 youtube_dl/extractor/peertube.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 52e330955..374aa185c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -811,6 +811,7 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .pearvideo import PearVideoIE +from .peertube import PeerTubeIE from .people import PeopleIE from .performgroup import PerformGroupIE from .periscope import ( diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py new file mode 100644 index 000000000..b086f6f5a --- /dev/null +++ b/youtube_dl/extractor/peertube.py @@ -0,0 +1,210 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_resolution, + try_get, + unified_timestamp, + urljoin, +) + + +class PeerTubeIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + # Taken from https://instances.joinpeertube.org/instances + tube\.openalgeria\.org| + peertube\.pointsecu\.fr| + peertube\.nogafa\.org| + peertube\.pl| + megatube\.lilomoino\.fr| + peertube\.tamanoir\.foucry\.net| + peertube\.inapurna\.org| + peertube\.netzspielplatz\.de| + video\.deadsuperhero\.com| + peertube\.devosi\.org| + peertube\.1312\.media| + tube\.worldofhauru\.xyz| + tube\.bootlicker\.party| + skeptikon\.fr| + peertube\.geekshell\.fr| + tube\.opportunis\.me| + peertube\.peshane\.net| + video\.blueline\.mg| + tube\.homecomputing\.fr| + videos\.cloudfrancois\.fr| + peertube\.viviers-fibre\.net| + tube\.ouahpiti\.info| + video\.tedomum\.net| + video\.g3l\.org| + fontube\.fr| + peertube\.gaialabs\.ch| + peertube\.extremely\.online| + peertube\.public-infrastructure\.eu| + tube\.kher\.nl| + peertube\.qtg\.fr| + tube\.22decembre\.eu| + facegirl\.me| + video\.migennes\.net| + janny\.moe| + tube\.p2p\.legal| + video\.atlanti\.se| + troll\.tv| + peertube\.geekael\.fr| + vid\.leotindall\.com| + video\.anormallostpod\.ovh| + p-tube\.h3z\.jp| + tube\.darfweb\.eu| + videos\.iut-orsay\.fr| + peertube\.solidev\.net| + videos\.symphonie-of-code\.fr| + testtube\.ortg\.de| + videos\.cemea\.org| + peertube\.gwendalavir\.eu| + video\.passageenseine\.fr| + videos\.festivalparminous\.org| + peertube\.touhoppai\.moe| + peertube\.duckdns\.org| + sikke\.fi| + peertube\.mastodon\.host| + firedragonvideos\.com| + vidz\.dou\.bet| + peertube\.koehn\.com| + peer\.hostux\.social| + share\.tube| + peertube\.walkingmountains\.fr| + medias\.libox\.fr| + peertube\.moe| + peertube\.xyz| + jp\.peertube\.network| + videos\.benpro\.fr| + tube\.otter\.sh| + peertube\.angristan\.xyz| + peertube\.parleur\.net| + peer\.ecutsa\.fr| + peertube\.heraut\.eu| + peertube\.tifox\.fr| + peertube\.maly\.io| + vod\.mochi\.academy| + exode\.me| + coste\.video| + tube\.aquilenet\.fr| + peertube\.gegeweb\.eu| + framatube\.org| + thinkerview\.video| + tube\.conferences-gesticulees\.net| + peertube\.datagueule\.tv| + video\.lqdn\.fr| + meilleurtube\.delire\.party| + tube\.mochi\.academy| + peertube\.dav\.li| + media\.zat\.im| + pytu\.be| + peertube\.valvin\.fr| + peertube\.nsa\.ovh| + video\.colibris-outilslibres\.org| + video\.hispagatos\.org| + tube\.svnet\.fr| + peertube\.video| + videos\.lecygnenoir\.info| + peertube3\.cpy\.re| + peertube2\.cpy\.re| + videos\.tcit\.fr| + peertube\.cpy\.re + ) + /videos/watch/(?P[^/?#&]+) + ''' + _TESTS = [{ + 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'md5': '80f24ff364cc9d333529506a263e7feb', + 'info_dict': { + 'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'ext': 'mp4', + 'title': 'wow', + 'description': 'wow such video, so gif', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', + 'timestamp': 1519297480, + 'upload_date': '20180222', + 'uploader': 'Luclu7', + 'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1', + 'uploder_url': 'https://peertube.nsa.ovh/accounts/luclu7', + 'license': 'Unknown', + 'duration': 3, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': list, + 'categories': list, + } + }, { + 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', + 'only_matching': True, + }, { + # nsfw + 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + urljoin(url, '/api/v1/videos/%s' % video_id), video_id) + + title = video['name'] + + formats = [] + for file_ in video['files']: + if not isinstance(file_, dict): + continue + file_url = file_.get('fileUrl') + if not file_url or not isinstance(file_url, compat_str): + continue + file_size = int_or_none(file_.get('size')) + format_id = try_get( + file_, lambda x: x['resolution']['label'], compat_str) + f = parse_resolution(format_id) + f.update({ + 'url': file_url, + 'format_id': format_id, + 'filesize': file_size, + }) + formats.append(f) + self._sort_formats(formats) + + def account_data(field): + return try_get(video, lambda x: x['account'][field], compat_str) + + category = try_get(video, lambda x: x['category']['label'], compat_str) + categories = [category] if category else None + + nsfw = video.get('nsfw') + if nsfw is bool: + age_limit = 18 if nsfw else 0 + else: + age_limit = None + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'timestamp': unified_timestamp(video.get('publishedAt')), + 'uploader': account_data('displayName'), + 'uploader_id': account_data('uuid'), + 'uploder_url': account_data('url'), + 'license': try_get( + video, lambda x: x['licence']['label'], compat_str), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likes')), + 'dislike_count': int_or_none(video.get('dislikes')), + 'age_limit': age_limit, + 'tags': try_get(video, lambda x: x['tags'], list), + 'categories': categories, + 'formats': formats, + } From f2fc63a5a873391b9ac15642507a2eae71e42906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 00:15:38 +0700 Subject: [PATCH 168/488] [peertube] Add support for embed and API URLs --- youtube_dl/extractor/peertube.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index b086f6f5a..61c41add0 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -116,7 +116,8 @@ class PeerTubeIE(InfoExtractor): videos\.tcit\.fr| peertube\.cpy\.re ) - /videos/watch/(?P[^/?#&]+) + /(?:videos/(?:watch|embed)|api/v\d/videos)/ + (?P[^/?#&]+) ''' _TESTS = [{ 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', @@ -147,6 +148,12 @@ class PeerTubeIE(InfoExtractor): # nsfw 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', 'only_matching': True, + }, { + 'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', + 'only_matching': True, + }, { + 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', + 'only_matching': True, }] def _real_extract(self, url): From 6bd499e8ca769cf69c4b24fa2d7a751d7869b679 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 00:28:30 +0700 Subject: [PATCH 169/488] [peertube] Add support for generic embeds --- youtube_dl/extractor/generic.py | 15 +++++++++++++++ youtube_dl/extractor/peertube.py | 23 +++++++++++++++++------ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 76852f9dc..47ac139c9 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -108,6 +108,7 @@ from .yapfiles import YapFilesIE from .vice import ViceIE from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE +from .peertube import PeerTubeIE class GenericIE(InfoExtractor): @@ -2012,6 +2013,15 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # PeerTube embed + 'url': 'https://joinpeertube.org/fr/home/', + 'info_dict': { + 'id': 'home', + 'title': 'Reprenez le contrôle de vos vidéos ! #JoinPeertube', + }, + 'playlist_count': 2, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -3029,6 +3039,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) + peertube_urls = PeerTubeIE._extract_urls(webpage) + if peertube_urls: + return self.playlist_from_matches( + peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r']+?\bsrc\s*=\s*(["\'])(?P(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index 61c41add0..a481b3151 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -13,9 +15,7 @@ from ..utils import ( class PeerTubeIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: + _INSTANCES_RE = r'''(?: # Taken from https://instances.joinpeertube.org/instances tube\.openalgeria\.org| peertube\.pointsecu\.fr| @@ -115,10 +115,13 @@ class PeerTubeIE(InfoExtractor): peertube2\.cpy\.re| videos\.tcit\.fr| peertube\.cpy\.re - ) + )''' + _VALID_URL = r'''(?x) + https?:// + %s /(?:videos/(?:watch|embed)|api/v\d/videos)/ - (?P[^/?#&]+) - ''' + (?P[^/?\#&]+) + ''' % _INSTANCES_RE _TESTS = [{ 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', 'md5': '80f24ff364cc9d333529506a263e7feb', @@ -156,6 +159,14 @@ class PeerTubeIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'''(?x)]+\bsrc=(["\'])(?P(?:https?:)?//%s/videos/embed/[^/?\#&]+)\1''' + % PeerTubeIE._INSTANCES_RE, webpage)] + def _real_extract(self, url): video_id = self._match_id(url) From b39f42ee92a3cd669da24db9798e1dc9b574720f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A1s=20Veres-Szentkir=C3=A1lyi?= Date: Fri, 25 May 2018 19:46:05 +0200 Subject: [PATCH 170/488] [indavideo] Sign download URLs --- youtube_dl/extractor/indavideo.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 11cf3c609..15b766fb2 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -2,10 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, parse_age_limit, parse_iso8601, + update_url_query, ) @@ -58,11 +60,10 @@ class IndavideoEmbedIE(InfoExtractor): if flv_url not in video_urls: video_urls.append(flv_url) - formats = [{ - 'url': video_url, - 'height': int_or_none(self._search_regex( - r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)), - } for video_url in video_urls] + filesh = video.get('filesh') + formats = [ + self.video_url_to_format(video_url, filesh) + for video_url in video_urls] self._sort_formats(formats) timestamp = video.get('date') @@ -90,6 +91,18 @@ class IndavideoEmbedIE(InfoExtractor): 'formats': formats, } + def video_url_to_format(self, video_url, filesh): + height = int_or_none(self._search_regex( + r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) + if height and filesh: + token = filesh.get(compat_str(height)) + if token is not None: + video_url = update_url_query(video_url, {'token': token}) + return { + 'url': video_url, + 'height': height, + } + class IndavideoIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P[^/#?]+)' From 2a7c6befc16f72df5368cb4adccd1cd84fd432d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 01:09:44 +0700 Subject: [PATCH 171/488] [indavideo] Fix extraction (closes #11221) --- youtube_dl/extractor/indavideo.py | 48 +++++++++++++++++++------------ 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 15b766fb2..2946c7b84 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -15,7 +15,7 @@ class IndavideoEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P[\da-f]+)' _TESTS = [{ 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', - 'md5': 'f79b009c66194acacd40712a6778acfa', + 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', 'info_dict': { 'id': '1837039', 'ext': 'mp4', @@ -47,7 +47,14 @@ class IndavideoEmbedIE(InfoExtractor): title = video['title'] - video_urls = video.get('video_files', []) + video_urls = [] + + video_files = video.get('video_files') + if isinstance(video_files, list): + video_urls.extend(video_files) + elif isinstance(video_files, dict): + video_urls.extend(video_files.values()) + video_file = video.get('video_file') if video: video_urls.append(video_file) @@ -61,9 +68,22 @@ class IndavideoEmbedIE(InfoExtractor): video_urls.append(flv_url) filesh = video.get('filesh') - formats = [ - self.video_url_to_format(video_url, filesh) - for video_url in video_urls] + + formats = [] + for video_url in video_urls: + height = int_or_none(self._search_regex( + r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) + if filesh: + if not height: + continue + token = filesh.get(compat_str(height)) + if token is None: + continue + video_url = update_url_query(video_url, {'token': token}) + formats.append({ + 'url': video_url, + 'height': height, + }) self._sort_formats(formats) timestamp = video.get('date') @@ -91,18 +111,6 @@ class IndavideoEmbedIE(InfoExtractor): 'formats': formats, } - def video_url_to_format(self, video_url, filesh): - height = int_or_none(self._search_regex( - r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) - if height and filesh: - token = filesh.get(compat_str(height)) - if token is not None: - video_url = update_url_query(video_url, {'token': token}) - return { - 'url': video_url, - 'height': height, - } - class IndavideoIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P[^/#?]+)' @@ -122,7 +130,7 @@ class IndavideoIE(InfoExtractor): 'upload_date': '20140127', 'duration': 7, 'age_limit': 0, - 'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'], + 'tags': list, }, }, { 'url': 'http://index.indavideo.hu/video/2015_0728_beregszasz', @@ -146,7 +154,9 @@ class IndavideoIE(InfoExtractor): webpage = self._download_webpage(url, display_id) embed_url = self._search_regex( - r']+rel="video_src"[^>]+href="(.+?)"', webpage, 'embed url') + (r']+\bsrc=(["\'])(?P(?:https?:)?//embed\.indavideo\.hu/player/video/.+?)\1', + r']+rel="video_src"[^>]+href="(?P.+?)"'), + webpage, 'embed url', group='url') return { '_type': 'url_transparent', From aee36ca832ec3a5696c40707098d97be0353e997 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 01:25:40 +0700 Subject: [PATCH 172/488] [indavideo] Add support for generic embeds (closes #11989) --- youtube_dl/extractor/extractors.py | 5 +-- youtube_dl/extractor/generic.py | 24 ++++++++++ youtube_dl/extractor/indavideo.py | 70 +++++++----------------------- 3 files changed, 41 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 374aa185c..c9b49a0cd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -469,10 +469,7 @@ from .imgur import ( ) from .ina import InaIE from .inc import IncIE -from .indavideo import ( - IndavideoIE, - IndavideoEmbedIE, -) +from .indavideo import IndavideoEmbedIE from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internazionale import InternazionaleIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 47ac139c9..0292e0458 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -109,6 +109,7 @@ from .vice import ViceIE from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE +from .indavideo import IndavideoEmbedIE class GenericIE(InfoExtractor): @@ -2022,6 +2023,24 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 2, }, + { + # Indavideo embed + 'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/', + 'info_dict': { + 'id': '1693903', + 'ext': 'mp4', + 'title': 'Így kell otthon hamburgert sütni', + 'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7', + 'timestamp': 1426330212, + 'upload_date': '20150314', + 'uploader': 'StreetKitchen', + 'uploader_id': '546363', + }, + 'add_ie': [IndavideoEmbedIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -3044,6 +3063,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) + if indavideo_urls: + return self.playlist_from_matches( + indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r']+?\bsrc\s*=\s*(["\'])(?P(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 2946c7b84..2b5b2b5b0 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -38,6 +40,20 @@ class IndavideoEmbedIE(InfoExtractor): 'only_matching': True, }] + # Some example URLs covered by generic extractor: + # http://indavideo.hu/video/Vicces_cica_1 + # http://index.indavideo.hu/video/2015_0728_beregszasz + # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko + # http://erotika.indavideo.hu/video/Amator_tini_punci + # http://film.indavideo.hu/video/f_hrom_nagymamm_volt + # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\'](?P(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)', + webpage) + def _real_extract(self, url): video_id = self._match_id(url) @@ -110,57 +126,3 @@ class IndavideoEmbedIE(InfoExtractor): 'tags': tags, 'formats': formats, } - - -class IndavideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P[^/#?]+)' - _TESTS = [{ - 'url': 'http://indavideo.hu/video/Vicces_cica_1', - 'md5': '8c82244ba85d2a2310275b318eb51eac', - 'info_dict': { - 'id': '1335611', - 'display_id': 'Vicces_cica_1', - 'ext': 'mp4', - 'title': 'Vicces cica', - 'description': 'Játszik a tablettel. :D', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Jet_Pack', - 'uploader_id': '491217', - 'timestamp': 1390821212, - 'upload_date': '20140127', - 'duration': 7, - 'age_limit': 0, - 'tags': list, - }, - }, { - 'url': 'http://index.indavideo.hu/video/2015_0728_beregszasz', - 'only_matching': True, - }, { - 'url': 'http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko', - 'only_matching': True, - }, { - 'url': 'http://erotika.indavideo.hu/video/Amator_tini_punci', - 'only_matching': True, - }, { - 'url': 'http://film.indavideo.hu/video/f_hrom_nagymamm_volt', - 'only_matching': True, - }, { - 'url': 'http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - embed_url = self._search_regex( - (r']+\bsrc=(["\'])(?P(?:https?:)?//embed\.indavideo\.hu/player/video/.+?)\1', - r']+rel="video_src"[^>]+href="(?P.+?)"'), - webpage, 'embed url', group='url') - - return { - '_type': 'url_transparent', - 'ie_key': 'IndavideoEmbed', - 'url': embed_url, - 'display_id': display_id, - } From f4d261b765a17ef2beccec78680ec693c7df014c Mon Sep 17 00:00:00 2001 From: Enes Date: Tue, 24 Apr 2018 22:48:40 +0300 Subject: [PATCH 173/488] [izlesene] Fix extraction (closes #16233) --- youtube_dl/extractor/izlesene.py | 33 ++++++++++---------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index b1d72177d..5b2095490 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( @@ -72,7 +70,7 @@ class IzleseneIE(InfoExtractor): 'uploadDate', webpage, 'upload date')) duration = float_or_none(self._html_search_regex( - r'"videoduration"\s*:\s*"([^"]+)"', + r'videoduration\s*=\s*\'([^\']+)\'', webpage, 'duration', fatal=False), scale=1000) view_count = str_to_int(get_element_by_id('videoViewCount', webpage)) @@ -80,29 +78,18 @@ class IzleseneIE(InfoExtractor): r'comment_count\s*=\s*\'([^\']+)\';', webpage, 'comment_count', fatal=False) - content_url = self._html_search_meta( - 'contentURL', webpage, 'content URL', fatal=False) - ext = determine_ext(content_url, 'mp4') - - # Might be empty for some videos. - streams = self._html_search_regex( - r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='') + streams_json = self._html_search_regex( + r'_videoObj\s*=\s*(.+);', webpage, 'streams') + streams = self._parse_json(streams_json, video_id) formats = [] - if streams: - for stream in streams.split('|'): - quality, url = re.search(r'\[(\w+)\](.+)', stream).groups() - formats.append({ - 'format_id': '%sp' % quality if quality else 'sd', - 'url': compat_urllib_parse_unquote(url), - 'ext': ext, - }) - else: - stream_url = self._search_regex( - r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL') + for stream in streams.get('media').get('level'): + url = stream.get('source') + ext = determine_ext(url, 'mp4') + quality = stream.get('value') formats.append({ - 'format_id': 'sd', - 'url': compat_urllib_parse_unquote(stream_url), + 'format_id': '%sp' % quality, + 'url': compat_urllib_parse_unquote(url), 'ext': ext, }) From 03fad17cb6ae24259808078a165c287c23d77f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 01:51:38 +0700 Subject: [PATCH 174/488] [izlesene] Improve extraction and fix issues (closes #16407, closes #16271) --- youtube_dl/extractor/izlesene.py | 55 +++++++++++++++++++------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index 5b2095490..f8fca6c8f 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( determine_ext, float_or_none, @@ -55,12 +58,33 @@ class IzleseneIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'http://www.izlesene.com/video/%s' % video_id - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage('http://www.izlesene.com/video/%s' % video_id, video_id) + + video = self._parse_json( + self._search_regex( + r'videoObj\s*=\s*({.+?})\s*;\s*\n', webpage, 'streams'), + video_id) + + title = video.get('videoTitle') or self._og_search_title(webpage) + + formats = [] + for stream in video['media']['level']: + source_url = stream.get('source') + if not source_url or not isinstance(source_url, compat_str): + continue + ext = determine_ext(url, 'mp4') + quality = stream.get('value') + height = int_or_none(quality) + formats.append({ + 'format_id': '%sp' % quality if quality else 'sd', + 'url': compat_urllib_parse_unquote(source_url), + 'ext': ext, + 'height': height, + }) + self._sort_formats(formats) - title = self._og_search_title(webpage) description = self._og_search_description(webpage, default=None) - thumbnail = self._proto_relative_url( + thumbnail = video.get('posterURL') or self._proto_relative_url( self._og_search_thumbnail(webpage), scheme='http:') uploader = self._html_search_regex( @@ -69,30 +93,15 @@ class IzleseneIE(InfoExtractor): timestamp = parse_iso8601(self._html_search_meta( 'uploadDate', webpage, 'upload date')) - duration = float_or_none(self._html_search_regex( - r'videoduration\s*=\s*\'([^\']+)\'', - webpage, 'duration', fatal=False), scale=1000) + duration = float_or_none(video.get('duration') or self._html_search_regex( + r'videoduration["\']?\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'duration', fatal=False, group='value'), scale=1000) view_count = str_to_int(get_element_by_id('videoViewCount', webpage)) comment_count = self._html_search_regex( r'comment_count\s*=\s*\'([^\']+)\';', webpage, 'comment_count', fatal=False) - streams_json = self._html_search_regex( - r'_videoObj\s*=\s*(.+);', webpage, 'streams') - streams = self._parse_json(streams_json, video_id) - - formats = [] - for stream in streams.get('media').get('level'): - url = stream.get('source') - ext = determine_ext(url, 'mp4') - quality = stream.get('value') - formats.append({ - 'format_id': '%sp' % quality, - 'url': compat_urllib_parse_unquote(url), - 'ext': ext, - }) - return { 'id': video_id, 'title': title, From 9ef5cdb5cb637660decbc82117d5d6790c48ad99 Mon Sep 17 00:00:00 2001 From: rhhayward Date: Fri, 25 May 2018 14:13:29 -0500 Subject: [PATCH 175/488] [audiomack] Stringify video id (closes #15310) --- youtube_dl/extractor/audiomack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index f3bd4d444..62049b921 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -65,7 +65,7 @@ class AudiomackIE(InfoExtractor): return {'_type': 'url', 'url': api_response['url'], 'ie_key': 'Soundcloud'} return { - 'id': api_response.get('id', album_url_tag), + 'id': compat_str(api_response.get('id', album_url_tag)), 'uploader': api_response.get('artist'), 'title': api_response.get('title'), 'url': api_response['url'], From bdbcc8eecb6d498e5c33dcbfb330d7d82021b3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Nov=C3=A1k?= Date: Fri, 25 May 2018 21:15:50 +0200 Subject: [PATCH 176/488] [dvtv] Remove dead test --- youtube_dl/extractor/dvtv.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index 3f760888e..20996962a 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -91,17 +91,6 @@ class DVTVIE(InfoExtractor): }, { 'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/', 'only_matching': True, - }, { - 'url': 'https://video.aktualne.cz/dvtv/babis-a-zeman-nesou-vinu-za-to-ze-nemame-jasno-v-tom-kdo-bud/r~026afb54fad711e79704ac1f6b220ee8/', - 'md5': '87defe16681b1429c91f7a74809823c6', - 'info_dict': { - 'id': 'f5ae72f6fad611e794dbac1f6b220ee8', - 'ext': 'mp4', - 'title': 'Babiš a Zeman nesou vinu za to, že nemáme jasno v tom, kdo bude vládnout, říká Pekarová Adamová', - }, - 'params': { - 'skip_download': True, - }, }] def _parse_video_metadata(self, js, video_id, live_js=None): From 5a16c9d9d37389d163b0004f1c9332764a50ef83 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 25 May 2018 23:12:18 +0100 Subject: [PATCH 177/488] [utils] keep the original TV_PARENTAL_GUIDELINES dict --- youtube_dl/utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d61af8837..7b4fd882f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2253,12 +2253,12 @@ US_RATINGS = { TV_PARENTAL_GUIDELINES = { - 'Y': 0, - 'Y7': 7, - 'G': 0, - 'PG': 0, - '14': 14, - 'MA': 17, + 'TV-Y': 0, + 'TV-Y7': 7, + 'TV-G': 0, + 'TV-PG': 0, + 'TV-14': 14, + 'TV-MA': 17, } @@ -2272,9 +2272,9 @@ def parse_age_limit(s): return int(m.group('age')) if s in US_RATINGS: return US_RATINGS[s] - m = re.match(r'^TV[_-]?(%s)$' % '|'.join(TV_PARENTAL_GUIDELINES.keys()), s) + m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s) if m: - return TV_PARENTAL_GUIDELINES[m.group(1)] + return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)] return None From 38e4e8ab80b784f59b3a3ef6d313a70e13f17cd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 12:58:34 +0700 Subject: [PATCH 178/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/ChangeLog b/ChangeLog index 08233cd5b..9d0264bf7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,31 @@ +version + +Core +* [utils] Improve parse_age_limit + +Extractors +* [audiomack] Stringify video id (#15310) +* [izlesene] Fix extraction (#16233, #16271, #16407) ++ [indavideo] Add support for generic embeds (#11989) +* [indavideo] Fix extraction (#11221) +* [indavideo] Sign download URLs (#16174) ++ [peertube] Add support for PeerTube based sites (#16301, #16329) +* [imgur] Fix extraction (#16537) ++ [hidive] Add support for authentication (#16534) ++ [nbc] Add support for stream.nbcsports.com (#13911) ++ [viewlift] Add support for hoichoi.tv (#16536) +* [go90] Extract age limit and detect DRM protection(#10127) +* [viewlift] fix extraction for snagfilms.com (#15766) +* [globo] Improve extraction (#4189) + * Add support for authentication + * Simplify URL signing + * Extract DASH and MSS formats +* [leeco] Fix extraction (#16464) +* [teamcoco] Add fallback for format extraction (#16484) +* [teamcoco] Improve URL regular expression (#16484) +* [imdb] Improve extraction (#4085, #14557) + + version 2018.05.18 Extractors From 0934c9d4faadbfd2b076d13c7e24f4bf039cdc79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 13:02:21 +0700 Subject: [PATCH 179/488] release 2018.05.26 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 13 ++++++------- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7d9de5171..c4d4e534e 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.26** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.18 +[debug] youtube-dl version 2018.05.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 9d0264bf7..280390ea0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.05.26 Core * [utils] Improve parse_age_limit diff --git a/README.md b/README.md index 20982b0f1..499a0c206 100644 --- a/README.md +++ b/README.md @@ -93,8 +93,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo ## Network Options: --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. - To enable experimental SOCKS proxy, specify - a proper scheme. For example + To enable SOCKS proxy, specify a proper + scheme. For example socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") for direct connection --socket-timeout SECONDS Time to wait before giving up, in seconds @@ -109,16 +109,15 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo option is not present) is used for the actual downloading. --geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header (experimental) + X-Forwarded-For HTTP header --no-geo-bypass Do not bypass geographic restriction via faking X-Forwarded-For HTTP header - (experimental) --geo-bypass-country CODE Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 - country code (experimental) + country code --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with explicitly provided IP block in CIDR - notation (experimental) + notation ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) @@ -209,7 +208,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --playlist-reverse Download playlist videos in reverse order --playlist-random Download playlist videos in random order --xattr-set-filesize Set file xattribute ytdl.filesize with - expected file size (experimental) + expected file size --hls-prefer-native Use the native HLS downloader instead of ffmpeg --hls-prefer-ffmpeg Use ffmpeg instead of the native HLS diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c1048cc4c..b60f2ff23 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -365,7 +365,6 @@ - **ImgurAlbum** - **Ina** - **Inc** - - **Indavideo** - **IndavideoEmbed** - **InfoQ** - **Instagram** @@ -526,6 +525,7 @@ - **nbcolympics** - **nbcolympics:stream** - **NBCSports** + - **NBCSportsStream** - **NBCSportsVPlayer** - **ndr**: NDR.de - Norddeutscher Rundfunk - **ndr:embed** @@ -625,6 +625,7 @@ - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **pcmag** - **PearVideo** + - **PeerTube** - **People** - **PerformGroup** - **periscope**: Periscope diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a43eec860..2253da927 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.18' +__version__ = '2018.05.26' From c678192af3f004205b18a16b7418cbd937c1b584 Mon Sep 17 00:00:00 2001 From: Zack Fernandes Date: Sun, 31 Dec 2017 13:55:35 -0800 Subject: [PATCH 180/488] [tumblr] Add support for authentication --- youtube_dl/extractor/tumblr.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 786143525..58ac66755 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,11 +4,18 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + ExtractorError, + int_or_none, + sanitized_Request, + urlencode_postdata +) class TumblrIE(InfoExtractor): _VALID_URL = r'https?://(?P[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P[0-9]+)(?:$|[/?#])' + _NETRC_MACHINE = 'tumblr' + _LOGIN_URL = 'https://www.tumblr.com/login' _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'md5': '479bb068e5b16462f5176a6828829767', @@ -97,6 +104,31 @@ class TumblrIE(InfoExtractor): 'add_ie': ['Instagram'], }] + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + self.report_login() + webpage = self._download_webpage(self._LOGIN_URL, None, False) + form = self._hidden_inputs(webpage) + form.update({ + 'user[email]': username, + 'user[password]': password + }) + login_response = self._download_webpage( + sanitized_Request(self._LOGIN_URL, urlencode_postdata(form), { + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': self._LOGIN_URL + }), None, False, 'Wrong login info') + + # Check the login response from Tumblr for an error message and fail the extraction if we find one. + login_errors = self._search_regex(r'Tumblr\.RegistrationForm\.errors\s*=\s*\[[\"|\'](.+)[\"|\']\]', login_response, 'login errors', False) + if login_errors: + raise ExtractorError('Error logging in: %s' % login_errors, expected=True) + def _real_extract(self, url): m_url = re.match(self._VALID_URL, url) video_id = m_url.group('id') From 56cd31f32015cce131fb40a112d323da57fdda8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 19:53:32 +0700 Subject: [PATCH 181/488] [tumblr] Improve authentication (closes #15133) --- youtube_dl/extractor/tumblr.py | 39 ++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 58ac66755..758ccbb44 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, urlencode_postdata ) @@ -111,23 +110,37 @@ class TumblrIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: return - self.report_login() - webpage = self._download_webpage(self._LOGIN_URL, None, False) - form = self._hidden_inputs(webpage) - form.update({ + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + login_form.update({ 'user[email]': username, 'user[password]': password }) - login_response = self._download_webpage( - sanitized_Request(self._LOGIN_URL, urlencode_postdata(form), { - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': self._LOGIN_URL - }), None, False, 'Wrong login info') - # Check the login response from Tumblr for an error message and fail the extraction if we find one. - login_errors = self._search_regex(r'Tumblr\.RegistrationForm\.errors\s*=\s*\[[\"|\'](.+)[\"|\']\]', login_response, 'login errors', False) + response, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': self._LOGIN_URL, + }) + + # Successful login + if '/dashboard' in urlh.geturl(): + return + + login_errors = self._parse_json( + self._search_regex( + r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response, + 'login errors', default='[]'), + None, fatal=False) if login_errors: - raise ExtractorError('Error logging in: %s' % login_errors, expected=True) + raise ExtractorError( + 'Unable to login: %s' % login_errors[0], expected=True) + + self.report_warning('Login has probably failed') def _real_extract(self, url): m_url = re.match(self._VALID_URL, url) From 97b01144bd9771f224749ffca10156a1cd7e9c1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 20:00:00 +0700 Subject: [PATCH 182/488] [tumblr] Detect and report sensitive media (closes #13829) --- youtube_dl/extractor/tumblr.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 758ccbb44..89e6eb5ab 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -150,11 +151,19 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage, urlh = self._download_webpage_handle(url, video_id) + redirect_url = compat_str(urlh.geturl()) + if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): + raise ExtractorError( + 'This Tumblr may contain sensitive media. ' + 'Disable safe mode in your account settings ' + 'at https://www.tumblr.com/settings/account#safe_mode', + expected=True) + iframe_url = self._search_regex( r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', webpage, 'iframe url', default=None) if iframe_url is None: - return self.url_result(urlh.geturl(), 'Generic') + return self.url_result(redirect_url, 'Generic') iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page') From 986c0b0215b127713825fa1523966ac66e03157b Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Sat, 26 May 2018 08:05:54 -0500 Subject: [PATCH 183/488] [cbc] Fix playlist title extraction (closes #16502) --- youtube_dl/extractor/cbc.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 54b4b9be9..ce8e3d346 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -20,6 +20,7 @@ from ..utils import ( parse_duration, parse_iso8601, parse_age_limit, + strip_or_none, int_or_none, ExtractorError, ) @@ -129,6 +130,9 @@ class CBCIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( + r'([^<]+)', webpage, 'title', fatal=False) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] @@ -136,8 +140,7 @@ class CBCIE(InfoExtractor): self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r']+src="[^"]+?mediaId=(\d+)"', webpage)]) return self.playlist_result( - entries, display_id, - self._og_search_title(webpage, fatal=False), + entries, display_id, strip_or_none(title), self._og_search_description(webpage)) From c0fd20abcad16bb2e377b6342a894a374c219763 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 May 2018 14:34:13 +0100 Subject: [PATCH 184/488] [soundcloud] detect format extension(closes #16549) --- youtube_dl/extractor/soundcloud.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 46332e5c2..81c81c8d5 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -181,7 +181,6 @@ class SoundcloudIE(InfoExtractor): thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') - ext = 'mp3' result = { 'id': track_id, 'uploader': info.get('user', {}).get('username'), @@ -215,8 +214,11 @@ class SoundcloudIE(InfoExtractor): track_id, 'Downloading track url', query=query) for key, stream_url in format_dict.items(): - abr = int_or_none(self._search_regex( - r'_(\d+)_url', key, 'audio bitrate', default=None)) + ext, abr = 'mp3', None + mobj = re.search(r'_([^_]+)_(\d+)_url', key) + if mobj: + ext, abr = mobj.groups() + abr = int(abr) if key.startswith('http'): stream_formats = [{ 'format_id': key, @@ -234,13 +236,14 @@ class SoundcloudIE(InfoExtractor): }] elif key.startswith('hls'): stream_formats = self._extract_m3u8_formats( - stream_url, track_id, 'mp3', entry_protocol='m3u8_native', + stream_url, track_id, ext, entry_protocol='m3u8_native', m3u8_id=key, fatal=False) else: continue - for f in stream_formats: - f['abr'] = abr + if abr: + for f in stream_formats: + f['abr'] = abr formats.extend(stream_formats) @@ -250,7 +253,7 @@ class SoundcloudIE(InfoExtractor): formats.append({ 'format_id': 'fallback', 'url': update_url_query(info['stream_url'], query), - 'ext': ext, + 'ext': 'mp3', }) for f in formats: From 261f47306c594614edb8a5f0b8f5f3b8a87ce9c0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 May 2018 14:35:47 +0100 Subject: [PATCH 185/488] [utils] fix style id extraction for namespaced id attribute(closes #16551) --- youtube_dl/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7b4fd882f..63f24c0b6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2667,6 +2667,7 @@ def dfxp2srt(dfxp_data): ] _x = functools.partial(xpath_with_ns, ns_map={ + 'xml': 'http://www.w3.org/XML/1998/namespace', 'ttml': 'http://www.w3.org/ns/ttml', 'tts': 'http://www.w3.org/ns/ttml#styling', }) @@ -2758,7 +2759,9 @@ def dfxp2srt(dfxp_data): repeat = False while True: for style in dfxp.findall(_x('.//ttml:style')): - style_id = style.get('id') + style_id = style.get('id') or style.get(_x('xml:id')) + if not style_id: + continue parent_style_id = style.get('style') if parent_style_id: if parent_style_id not in styles: From 2a49d01992e0b4b87d78da8f83af2f6e57fb8ba8 Mon Sep 17 00:00:00 2001 From: mars67857 Date: Sat, 14 Oct 2017 22:09:44 -0700 Subject: [PATCH 186/488] [cammodels] Add extractor --- youtube_dl/extractor/cammodels.py | 93 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 94 insertions(+) create mode 100644 youtube_dl/extractor/cammodels.py diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py new file mode 100644 index 000000000..1711d7096 --- /dev/null +++ b/youtube_dl/extractor/cammodels.py @@ -0,0 +1,93 @@ +from __future__ import unicode_literals +from .common import InfoExtractor +from .common import ExtractorError +import json +import re +from ..utils import int_or_none + + +class CamModelsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P\w+)' + _HEADERS = { + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' + # Needed because server doesn't return links to video URLs if a browser-like User-Agent is not used + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + url, + video_id, + headers=self._HEADERS) + manifest_url_root = self._html_search_regex( + r'manifestUrlRoot=(?Phttps?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))', + webpage, + 'manifest', + None, + False) + if not manifest_url_root: + offline = self._html_search_regex( + r'(?PI\'m offline, but let\'s stay connected!)', + webpage, + 'offline indicator', + None, + False) + private = self._html_search_regex( + r'(?PI’m in a private show right now)', + webpage, + 'private show indicator', + None, + False) + err = 'This user is currently offline, so nothing can be downloaded.' if offline \ + else 'This user is doing a private show, which requires payment. This extractor currently does not support private streams.' if private \ + else 'Unable to find link to stream info on webpage. Room is not offline, so something else is wrong.' + raise ExtractorError( + err, + expected=True if offline or private else False, + video_id=video_id + ) + manifest_url = manifest_url_root + video_id + '.json' + manifest = self._download_json( + manifest_url, + video_id, + 'Downloading links to streams.', + 'Link to stream URLs was found, but we couldn\'t access it.', + headers=self._HEADERS) + try: + formats = [] + for fmtName in ['mp4-rtmp', 'mp4-hls']: + for encoding in manifest['formats'][fmtName]['encodings']: + formats.append({ + 'ext': 'mp4', + 'url': encoding['location'], + 'width': int_or_none(encoding.get('videoWidth')), + 'height': int_or_none(encoding.get('videoHeight')), + 'vbr': int_or_none(encoding.get('videoKbps')), + 'abr': int_or_none(encoding.get('audioKbps')), + 'format_id': fmtName + str(encoding.get('videoWidth')) + }) + # If they change the JSON format, then fallback to parsing out RTMP links via regex. + except KeyError: + manifest_json = json.dumps(manifest) + manifest_links = re.finditer( + r'(?Prtmp?:\/\/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#&//=]*))', + manifest_json) + if not manifest_links: + raise ExtractorError( + 'Link to stream info was found, but we couldn\'t read the response. This is probably a bug.', + expected=False, + video_id=video_id) + formats = [] + for manifest_link in manifest_links: + url = manifest_link.group('id') + formats.append({ + 'ext': 'mp4', + 'url': url, + 'format_id': url.split(sep='/')[-1] + }) + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._live_title(video_id), + 'formats': formats + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c9b49a0cd..d54e8df9f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -145,6 +145,7 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) +from .cammodels import CamModelsIE from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE From 8b1da46e8f6dd0de790a54a4809d224041262537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 21:25:01 +0700 Subject: [PATCH 187/488] [cammodels] Improve and simplify (closes #14499) --- youtube_dl/extractor/cammodels.py | 159 +++++++++++++++--------------- 1 file changed, 80 insertions(+), 79 deletions(-) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 1711d7096..4f1b88d14 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -1,93 +1,94 @@ +# coding: utf-8 from __future__ import unicode_literals + from .common import InfoExtractor -from .common import ExtractorError -import json -import re -from ..utils import int_or_none +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, +) class CamModelsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P\w+)' - _HEADERS = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' - # Needed because server doesn't return links to video URLs if a browser-like User-Agent is not used - } + _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cammodels.com/cam/AutumnKnight/', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - url, - video_id, - headers=self._HEADERS) - manifest_url_root = self._html_search_regex( - r'manifestUrlRoot=(?Phttps?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))', - webpage, - 'manifest', - None, - False) - if not manifest_url_root: - offline = self._html_search_regex( - r'(?PI\'m offline, but let\'s stay connected!)', - webpage, - 'offline indicator', - None, - False) - private = self._html_search_regex( - r'(?PI’m in a private show right now)', - webpage, - 'private show indicator', - None, - False) - err = 'This user is currently offline, so nothing can be downloaded.' if offline \ - else 'This user is doing a private show, which requires payment. This extractor currently does not support private streams.' if private \ - else 'Unable to find link to stream info on webpage. Room is not offline, so something else is wrong.' - raise ExtractorError( - err, - expected=True if offline or private else False, - video_id=video_id + user_id = self._match_id(url) + + webpage = self._download_webpage(url, user_id) + + manifest_root = self._html_search_regex( + r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) + + if not manifest_root: + ERRORS = ( + ("I'm offline, but let's stay connected", 'This user is currently offline'), + ('in a private show', 'This user is in a private show'), ) - manifest_url = manifest_url_root + video_id + '.json' + for pattern, message in ERRORS: + if pattern in webpage: + error = message + expected = True + break + else: + error = 'Unable to find manifest URL root' + expected = False + raise ExtractorError(error, expected=expected) + manifest = self._download_json( - manifest_url, - video_id, - 'Downloading links to streams.', - 'Link to stream URLs was found, but we couldn\'t access it.', - headers=self._HEADERS) - try: - formats = [] - for fmtName in ['mp4-rtmp', 'mp4-hls']: - for encoding in manifest['formats'][fmtName]['encodings']: - formats.append({ + '%s%s.json' % (manifest_root, user_id), user_id) + + formats = [] + for format_id, format_dict in manifest['formats'].items(): + if not isinstance(format_dict, dict): + continue + encodings = format_dict.get('encodings') + if not isinstance(encodings, list): + continue + vcodec = format_dict.get('videoCodec') + acodec = format_dict.get('audioCodec') + for media in encodings: + if not isinstance(media, dict): + continue + media_url = media.get('location') + if not media_url or not isinstance(media_url, compat_str): + continue + + format_id_list = [format_id] + height = int_or_none(media.get('videoHeight')) + if height is not None: + format_id_list.append('%dp' % height) + f = { + 'url': media_url, + 'format_id': '-'.join(format_id_list), + 'width': int_or_none(media.get('videoWidth')), + 'height': height, + 'vbr': int_or_none(media.get('videoKbps')), + 'abr': int_or_none(media.get('audioKbps')), + 'fps': int_or_none(media.get('fps')), + 'vcodec': vcodec, + 'acodec': acodec, + } + if 'rtmp' in format_id: + f['ext'] = 'flv' + elif 'hls' in format_id: + f.update({ 'ext': 'mp4', - 'url': encoding['location'], - 'width': int_or_none(encoding.get('videoWidth')), - 'height': int_or_none(encoding.get('videoHeight')), - 'vbr': int_or_none(encoding.get('videoKbps')), - 'abr': int_or_none(encoding.get('audioKbps')), - 'format_id': fmtName + str(encoding.get('videoWidth')) + # hls skips fragments, preferring rtmp + 'preference': -1, }) - # If they change the JSON format, then fallback to parsing out RTMP links via regex. - except KeyError: - manifest_json = json.dumps(manifest) - manifest_links = re.finditer( - r'(?Prtmp?:\/\/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#&//=]*))', - manifest_json) - if not manifest_links: - raise ExtractorError( - 'Link to stream info was found, but we couldn\'t read the response. This is probably a bug.', - expected=False, - video_id=video_id) - formats = [] - for manifest_link in manifest_links: - url = manifest_link.group('id') - formats.append({ - 'ext': 'mp4', - 'url': url, - 'format_id': url.split(sep='/')[-1] - }) + else: + continue + formats.append(f) self._sort_formats(formats) + return { - 'id': video_id, - 'title': self._live_title(video_id), - 'formats': formats + 'id': user_id, + 'title': self._live_title(user_id), + 'is_live': True, + 'formats': formats, } From ec2f3d2800185920629a7e6946701edebbf14dd6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 May 2018 15:34:36 +0100 Subject: [PATCH 188/488] [ufctv] add support for authentication(closes #16542) --- youtube_dl/extractor/ufctv.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/ufctv.py b/youtube_dl/extractor/ufctv.py index ab823814b..f3eaee6b3 100644 --- a/youtube_dl/extractor/ufctv.py +++ b/youtube_dl/extractor/ufctv.py @@ -3,13 +3,16 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + ExtractorError, parse_duration, parse_iso8601, + urlencode_postdata, ) class UFCTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ufc\.tv/video/(?P[^/]+)' + _NETRC_MACHINE = 'ufctv' _TEST = { 'url': 'https://www.ufc.tv/video/ufc-219-countdown-full-episode', 'info_dict': { @@ -26,6 +29,21 @@ class UFCTVIE(InfoExtractor): } } + def _real_initialize(self): + username, password = self._get_login_info() + if username is None: + return + + code = self._download_json( + 'https://www.ufc.tv/secure/authenticate', + None, 'Logging in', data=urlencode_postdata({ + 'username': username, + 'password': password, + 'format': 'json', + })).get('code') + if code and code != 'loginsuccess': + raise ExtractorError(code, expected=True) + def _real_extract(self, url): display_id = self._match_id(url) video_data = self._download_json(url, display_id, query={ From 68217024e83c8e7965f2800e9ff7a9575f049b5c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 May 2018 16:12:44 +0100 Subject: [PATCH 189/488] remove unnecessary assignment parenthesis --- youtube_dl/extractor/animeondemand.py | 2 +- youtube_dl/extractor/atresplayer.py | 2 +- youtube_dl/extractor/bambuser.py | 2 +- youtube_dl/extractor/crunchyroll.py | 2 +- youtube_dl/extractor/curiositystream.py | 2 +- youtube_dl/extractor/dramafever.py | 2 +- youtube_dl/extractor/facebook.py | 2 +- youtube_dl/extractor/fc2.py | 2 +- youtube_dl/extractor/funimation.py | 2 +- youtube_dl/extractor/gdcvault.py | 2 +- youtube_dl/extractor/globo.py | 5 ----- youtube_dl/extractor/hidive.py | 7 +------ youtube_dl/extractor/hrti.py | 2 +- youtube_dl/extractor/iqiyi.py | 2 +- youtube_dl/extractor/niconico.py | 2 +- youtube_dl/extractor/noco.py | 2 +- youtube_dl/extractor/packtpub.py | 2 +- youtube_dl/extractor/patreon.py | 2 +- youtube_dl/extractor/pluralsight.py | 2 +- youtube_dl/extractor/roosterteeth.py | 2 +- youtube_dl/extractor/safari.py | 2 +- youtube_dl/extractor/sina.py | 2 +- youtube_dl/extractor/tennistv.py | 2 +- youtube_dl/extractor/tubitv.py | 2 +- youtube_dl/extractor/tumblr.py | 2 +- youtube_dl/extractor/twitch.py | 2 +- youtube_dl/extractor/udemy.py | 2 +- youtube_dl/extractor/vessel.py | 2 +- youtube_dl/extractor/viki.py | 2 +- youtube_dl/extractor/vimeo.py | 2 +- youtube_dl/extractor/vk.py | 2 +- youtube_dl/extractor/youtube.py | 2 +- youtube_dl/extractor/zattoo.py | 2 +- 33 files changed, 32 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index e4fa72f46..1fe5d5e56 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -52,7 +52,7 @@ class AnimeOnDemandIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 1a31ebe08..ae1c09427 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -74,7 +74,7 @@ class AtresPlayerIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index 633c57553..34f1b3d83 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -44,7 +44,7 @@ class BambuserIE(InfoExtractor): } def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 3efdc8c21..311da515d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -49,7 +49,7 @@ class CrunchyrollBaseIE(InfoExtractor): }) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index 8e45923e3..35b1e7a34 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -35,7 +35,7 @@ class CuriosityStreamBaseIE(InfoExtractor): return result['data'] def _real_initialize(self): - (email, password) = self._get_login_info() + email, password = self._get_login_info() if email is None: return result = self._download_json( diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index ffbd2623d..ab32ba4ff 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -42,7 +42,7 @@ class DramaFeverBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 220ada3a6..0971ce356 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -226,7 +226,7 @@ class FacebookIE(InfoExtractor): return urls def _login(self): - (useremail, password) = self._get_login_info() + useremail, password = self._get_login_info() if useremail is None: return diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 448647d72..435561147 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -46,7 +46,7 @@ class FC2IE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None or password is None: return False diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 107f658ba..07d01caec 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -51,7 +51,7 @@ class FunimationIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return try: diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index f71d9092e..8806dc48a 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -91,7 +91,7 @@ class GDCVaultIE(InfoExtractor): ] def _login(self, webpage_url, display_id): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None or password is None: self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.') return None diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 81d6d36d3..c2140c362 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -23,7 +23,6 @@ from ..utils import ( class GloboIE(InfoExtractor): _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P\d{7,})' - _LOGGED_IN = False _NETRC_MACHINE = 'globo' _TESTS = [{ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', @@ -68,9 +67,6 @@ class GloboIE(InfoExtractor): }] def _real_initialize(self): - if self._LOGGED_IN: - return - email, password = self._get_login_info() if email is None: return @@ -91,7 +87,6 @@ class GloboIE(InfoExtractor): resp = self._parse_json(e.cause.read(), None) raise ExtractorError(resp.get('userMessage') or resp['id'], expected=True) raise - self._LOGGED_IN = True def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py index d8f2e682f..39fabe8a5 100644 --- a/youtube_dl/extractor/hidive.py +++ b/youtube_dl/extractor/hidive.py @@ -18,7 +18,6 @@ class HiDiveIE(InfoExtractor): # so disabling geo bypass completely _GEO_BYPASS = False _NETRC_MACHINE = 'hidive' - _LOGGED_IN = False _LOGIN_URL = 'https://www.hidive.com/account/login' _TESTS = [{ @@ -38,10 +37,7 @@ class HiDiveIE(InfoExtractor): }] def _real_initialize(self): - if self._LOGGED_IN: - return - - (email, password) = self._get_login_info() + email, password = self._get_login_info() if email is None: return @@ -56,7 +52,6 @@ class HiDiveIE(InfoExtractor): }) self._download_webpage( self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data)) - self._LOGGED_IN = True def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py index 6424d34ac..9ba1aa703 100644 --- a/youtube_dl/extractor/hrti.py +++ b/youtube_dl/extractor/hrti.py @@ -66,7 +66,7 @@ class HRTiBaseIE(InfoExtractor): self._logout_url = modules['user']['resources']['logout']['uri'] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # TODO: figure out authentication with cookies if username is None or password is None: self.raise_login_required() diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index fdfa7de9e..4b081bd46 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -239,7 +239,7 @@ class IqiyiIE(InfoExtractor): return ohdave_rsa_encrypt(data, e, N) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if not username: diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index df7f528be..dbe871f16 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -163,7 +163,7 @@ class NiconicoIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if not username: return True diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index a9f9b10c4..58b371ed7 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -65,7 +65,7 @@ class NocoIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py index 8ed3c6347..56a2a1083 100644 --- a/youtube_dl/extractor/packtpub.py +++ b/youtube_dl/extractor/packtpub.py @@ -42,7 +42,7 @@ class PacktPubIE(PacktPubBaseIE): _TOKEN = None def _real_initialize(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return try: diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index d4b1d34ca..9eb027679 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -53,7 +53,7 @@ class PatreonIE(InfoExtractor): # needed. Keeping this commented for when this inevitably changes. ''' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 3c508c9ca..a207ca9cb 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -94,7 +94,7 @@ class PluralsightIE(PluralsightBaseIE): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 8b703800e..857434540 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -50,7 +50,7 @@ class RoosterTeethIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index cc6698f88..8a5d48fc2 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -27,7 +27,7 @@ class SafariBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index 8fc66732a..07b766b4a 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -64,7 +64,7 @@ class SinaIE(InfoExtractor): # The video id is in the redirected url self.to_screen('Getting video id') request = HEADRequest(url) - (_, urlh) = self._download_webpage_handle(request, 'NA', False) + _, urlh = self._download_webpage_handle(request, 'NA', False) return self._real_extract(urlh.geturl()) else: pseudo_id = mobj.group('pseudo_id') diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py index 0c6f70784..a586f30ad 100644 --- a/youtube_dl/extractor/tennistv.py +++ b/youtube_dl/extractor/tennistv.py @@ -32,7 +32,7 @@ class TennisTVIE(InfoExtractor): _NETRC_MACHINE = 'tennistv' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if not username or not password: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py index 36f6c1673..a51fa6515 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -36,7 +36,7 @@ class TubiTvIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return self.report_login() diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 89e6eb5ab..edbb0aa69 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -108,7 +108,7 @@ class TumblrIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 3ee2af52e..e01f11331 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -61,7 +61,7 @@ class TwitchBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 0a74a9768..a7196997e 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -151,7 +151,7 @@ class UdemyIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py index 80a643dfe..31eee0ba7 100644 --- a/youtube_dl/extractor/vessel.py +++ b/youtube_dl/extractor/vessel.py @@ -75,7 +75,7 @@ class VesselIE(InfoExtractor): 'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return self.report_login() diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index ad2a2a4b7..546de95d8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -88,7 +88,7 @@ class VikiBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8dfd8891c..3baa2d075 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -37,7 +37,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): _LOGIN_URL = 'https://vimeo.com/log_in' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: if self._LOGIN_REQUIRED: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index b50d4f170..29002b35f 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -32,7 +32,7 @@ class VKBaseIE(InfoExtractor): _NETRC_MACHINE = 'vk' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e4eec7c30..379559825 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -85,7 +85,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. """ - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if username is None: if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index 773073d85..b5a3a0716 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -24,7 +24,7 @@ class ZattooBaseIE(InfoExtractor): _power_guide_hash = None def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if not username or not password: self.raise_login_required( 'A valid %s account is needed to access this media.' From ddd8486a448ee94134a62f2488e5e39bbd72880e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 22:10:08 +0700 Subject: [PATCH 190/488] [downloader/rtmp] Gracefully handle live streams interrupted by user --- youtube_dl/downloader/rtmp.py | 119 +++++++++++++++++++--------------- 1 file changed, 66 insertions(+), 53 deletions(-) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index b823b5171..63e2b5c89 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -24,71 +24,78 @@ class RtmpFD(FileDownloader): def real_download(self, filename, info_dict): def run_rtmpdump(args): start = time.time() - resume_percent = None - resume_downloaded_data_len = None proc = subprocess.Popen(args, stderr=subprocess.PIPE) cursor_in_new_line = True - proc_stderr_closed = False - while not proc_stderr_closed: - # read line from stderr - line = '' - while True: - char = proc.stderr.read(1) - if not char: - proc_stderr_closed = True - break - if char in [b'\r', b'\n']: - break - line += char.decode('ascii', 'replace') - if not line: - # proc_stderr_closed is True - continue - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) - if mobj: - downloaded_data_len = int(float(mobj.group(1)) * 1024) - percent = float(mobj.group(2)) - if not resume_percent: - resume_percent = percent - resume_downloaded_data_len = downloaded_data_len - time_now = time.time() - eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) - speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) - data_len = None - if percent > 0: - data_len = int(downloaded_data_len * 100 / percent) - self._hook_progress({ - 'status': 'downloading', - 'downloaded_bytes': downloaded_data_len, - 'total_bytes_estimate': data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'eta': eta, - 'elapsed': time_now - start, - 'speed': speed, - }) - cursor_in_new_line = False - else: - # no percent for live streams - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + + def dl(): + resume_percent = None + resume_downloaded_data_len = None + proc_stderr_closed = False + while not proc_stderr_closed: + # read line from stderr + line = '' + while True: + char = proc.stderr.read(1) + if not char: + proc_stderr_closed = True + break + if char in [b'\r', b'\n']: + break + line += char.decode('ascii', 'replace') + if not line: + # proc_stderr_closed is True + continue + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) if mobj: downloaded_data_len = int(float(mobj.group(1)) * 1024) + percent = float(mobj.group(2)) + if not resume_percent: + resume_percent = percent + resume_downloaded_data_len = downloaded_data_len time_now = time.time() - speed = self.calc_speed(start, time_now, downloaded_data_len) + eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) + speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) + data_len = None + if percent > 0: + data_len = int(downloaded_data_len * 100 / percent) self._hook_progress({ + 'status': 'downloading', 'downloaded_bytes': downloaded_data_len, + 'total_bytes_estimate': data_len, 'tmpfilename': tmpfilename, 'filename': filename, - 'status': 'downloading', + 'eta': eta, 'elapsed': time_now - start, 'speed': speed, }) cursor_in_new_line = False - elif self.params.get('verbose', False): - if not cursor_in_new_line: - self.to_screen('') - cursor_in_new_line = True - self.to_screen('[rtmpdump] ' + line) - proc.wait() + else: + # no percent for live streams + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1)) * 1024) + time_now = time.time() + speed = self.calc_speed(start, time_now, downloaded_data_len) + self._hook_progress({ + 'downloaded_bytes': downloaded_data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'status': 'downloading', + 'elapsed': time_now - start, + 'speed': speed, + }) + cursor_in_new_line = False + elif self.params.get('verbose', False): + if not cursor_in_new_line: + self.to_screen('') + cursor_in_new_line = True + self.to_screen('[rtmpdump] ' + line) + + try: + dl() + finally: + proc.wait() + if not cursor_in_new_line: self.to_screen('') return proc.returncode @@ -163,7 +170,13 @@ class RtmpFD(FileDownloader): RD_INCOMPLETE = 2 RD_NO_CONNECT = 3 - retval = run_rtmpdump(args) + try: + retval = run_rtmpdump(args) + except KeyboardInterrupt: + if not info_dict.get('is_live'): + raise + retval = RD_SUCCESS + self.to_screen('\n[rtmpdump] Interrupted by user') if retval == RD_NO_CONNECT: self.report_error('[rtmpdump] Could not connect to RTMP server.') From f16f48779cbad4a6d39a908e131a8d55941d1671 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 22:14:09 +0700 Subject: [PATCH 191/488] [downloader/rtmp] Generalize download messages and report time elapsed on finish --- youtube_dl/downloader/rtmp.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 63e2b5c89..9e0ddbb18 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -170,6 +170,8 @@ class RtmpFD(FileDownloader): RD_INCOMPLETE = 2 RD_NO_CONNECT = 3 + started = time.time() + try: retval = run_rtmpdump(args) except KeyboardInterrupt: @@ -184,7 +186,7 @@ class RtmpFD(FileDownloader): while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live: prevsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] %s bytes' % prevsize) + self.to_screen('[rtmpdump] Downloaded %s bytes' % prevsize) time.sleep(5.0) # This seems to be needed args = basic_args + ['--resume'] if retval == RD_FAILED: @@ -201,13 +203,14 @@ class RtmpFD(FileDownloader): break if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] %s bytes' % fsize) + self.to_screen('[rtmpdump] Downloaded %s bytes' % fsize) self.try_rename(tmpfilename, filename) self._hook_progress({ 'downloaded_bytes': fsize, 'total_bytes': fsize, 'filename': filename, 'status': 'finished', + 'elapsed': time.time() - started, }) return True else: From 2ce35d9f43328e82108bae6661c2ac0ba2a0498c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 22:21:55 +0700 Subject: [PATCH 192/488] [cammodels] Add another error pattern --- youtube_dl/extractor/cammodels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 4f1b88d14..17f7ac043 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -28,6 +28,7 @@ class CamModelsIE(InfoExtractor): ERRORS = ( ("I'm offline, but let's stay connected", 'This user is currently offline'), ('in a private show', 'This user is in a private show'), + ('is currently performing LIVE', 'This model is currently performing live'), ) for pattern, message in ERRORS: if pattern in webpage: From 8882840ec5d9536772d7de75b7fb6389103a3a1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 22:22:28 +0700 Subject: [PATCH 193/488] [cammodels] Use geo verification headers --- youtube_dl/extractor/cammodels.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 17f7ac043..ee0165dba 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -19,7 +19,8 @@ class CamModelsIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage(url, user_id) + webpage = self._download_webpage( + url, user_id, headers=self.geo_verification_headers()) manifest_root = self._html_search_regex( r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) From c9e12a618c9420c2bb21c09bf47b9469785f492e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 May 2018 12:10:12 +0100 Subject: [PATCH 194/488] [9c9media] extract mpd formats and subtitles --- youtube_dl/extractor/ctvnews.py | 4 +- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/ninecninemedia.py | 93 ++++++++++---------------- youtube_dl/extractor/rds.py | 2 +- 4 files changed, 41 insertions(+), 63 deletions(-) diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py index 55a127b76..03f8cefb7 100644 --- a/youtube_dl/extractor/ctvnews.py +++ b/youtube_dl/extractor/ctvnews.py @@ -11,10 +11,10 @@ class CTVNewsIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P[0-9.]+)' _TESTS = [{ 'url': 'http://www.ctvnews.ca/video?clipId=901995', - 'md5': '10deb320dc0ccb8d01d34d12fc2ea672', + 'md5': '9b8624ba66351a23e0b6e1391971f9af', 'info_dict': { 'id': '901995', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Extended: \'That person cannot be me\' Johnson says', 'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285', 'timestamp': 1467286284, diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d54e8df9f..2f485012f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -718,10 +718,7 @@ from .nick import ( NickRuIE, ) from .niconico import NiconicoIE, NiconicoPlaylistIE -from .ninecninemedia import ( - NineCNineMediaStackIE, - NineCNineMediaIE, -) +from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index 8961309fd..875665d43 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -13,38 +13,11 @@ from ..utils import ( ) -class NineCNineMediaBaseIE(InfoExtractor): - _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' - - -class NineCNineMediaStackIE(NineCNineMediaBaseIE): - IE_NAME = '9c9media:stack' - _GEO_COUNTRIES = ['CA'] - _VALID_URL = r'9c9media:stack:(?P[^:]+):(?P\d+):(?P\d+):(?P\d+)' - - def _real_extract(self, url): - destination_code, content_id, package_id, stack_id = re.match(self._VALID_URL, url).groups() - stack_base_url_template = self._API_BASE_TEMPLATE + 'contentpackages/%s/stacks/%s/manifest.' - stack_base_url = stack_base_url_template % (destination_code, content_id, package_id, stack_id) - - formats = [] - formats.extend(self._extract_m3u8_formats( - stack_base_url + 'm3u8', stack_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - stack_base_url + 'f4m', stack_id, - f4m_id='hds', fatal=False)) - self._sort_formats(formats) - - return { - 'id': stack_id, - 'formats': formats, - } - - -class NineCNineMediaIE(NineCNineMediaBaseIE): +class NineCNineMediaIE(InfoExtractor): IE_NAME = '9c9media' + _GEO_COUNTRIES = ['CA'] _VALID_URL = r'9c9media:(?P[^:]+):(?P\d+)' + _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' def _real_extract(self, url): destination_code, content_id = re.match(self._VALID_URL, url).groups() @@ -58,13 +31,26 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): content_package = content['ContentPackages'][0] package_id = content_package['Id'] content_package_url = api_base_url + 'contentpackages/%s/' % package_id - content_package = self._download_json(content_package_url, content_id) + content_package = self._download_json( + content_package_url, content_id, query={ + '$include': '[HasClosedCaptions]', + }) - if content_package.get('Constraints', {}).get('Security', {}).get('Type') == 'adobe-drm': + if content_package.get('Constraints', {}).get('Security', {}).get('Type'): raise ExtractorError('This video is DRM protected.', expected=True) - stacks = self._download_json(content_package_url + 'stacks/', package_id)['Items'] - multistacks = len(stacks) > 1 + manifest_base_url = content_package_url + 'manifest.' + formats = [] + formats.extend(self._extract_m3u8_formats( + manifest_base_url + 'm3u8', content_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + manifest_base_url + 'f4m', content_id, + f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + manifest_base_url + 'mpd', content_id, + mpd_id='dash', fatal=False)) + self._sort_formats(formats) thumbnails = [] for image in content.get('Images', []): @@ -85,10 +71,12 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): continue container.append(e_name) - description = content.get('Desc') or content.get('ShortDesc') season = content.get('Season', {}) - base_info = { - 'description': description, + + info = { + 'id': content_id, + 'title': title, + 'description': content.get('Desc') or content.get('ShortDesc'), 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), 'episode_number': int_or_none(content.get('Episode')), 'season': season.get('Name'), @@ -97,26 +85,19 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): 'series': content.get('Media', {}).get('Name'), 'tags': tags, 'categories': categories, + 'duration': float_or_none(content_package.get('Duration')), + 'formats': formats, } - entries = [] - for stack in stacks: - stack_id = compat_str(stack['Id']) - entry = { - '_type': 'url_transparent', - 'url': '9c9media:stack:%s:%s:%s:%s' % (destination_code, content_id, package_id, stack_id), - 'id': stack_id, - 'title': '%s_part%s' % (title, stack['Name']) if multistacks else title, - 'duration': float_or_none(stack.get('Duration')), - 'ie_key': 'NineCNineMediaStack', + if content_package.get('HasClosedCaptions'): + info['subtitles'] = { + 'en': [{ + 'url': manifest_base_url + 'vtt', + 'ext': 'vtt', + }, { + 'url': manifest_base_url + 'srt', + 'ext': 'srt', + }] } - entry.update(base_info) - entries.append(entry) - return { - '_type': 'multi_video', - 'id': content_id, - 'title': title, - 'description': description, - 'entries': entries, - } + return info diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index bf200ea4d..8c016a77d 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -19,7 +19,7 @@ class RDSIE(InfoExtractor): 'info_dict': { 'id': '604333', 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Fowler Jr. prend la direction de Jacksonville', 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ', 'timestamp': 1430397346, From 9c65c4a6cd981e081f4a99d11206e984999f51ff Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 May 2018 12:11:53 +0100 Subject: [PATCH 195/488] [bellmedia] add support for bnnbloomberg.ca(#16560) --- youtube_dl/extractor/bellmedia.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 8820a3914..f36a2452d 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -12,7 +12,7 @@ class BellMediaIE(InfoExtractor): (?: ctv| tsn| - bnn| + bnn(?:bloomberg)?| thecomedynetwork| discovery| discoveryvelocity| @@ -27,17 +27,16 @@ class BellMediaIE(InfoExtractor): much\.com )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' _TESTS = [{ - 'url': 'http://www.ctv.ca/video/player?vid=706966', - 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', + 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', + 'md5': '36d3ef559cfe8af8efe15922cd3ce950', 'info_dict': { - 'id': '706966', - 'ext': 'mp4', - 'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'', - 'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.', - 'upload_date': '20150919', - 'timestamp': 1442624700, + 'id': '1403070', + 'ext': 'flv', + 'title': 'David Cockfield\'s Top Picks', + 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', + 'upload_date': '20180525', + 'timestamp': 1527288600, }, - 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', 'only_matching': True, @@ -70,6 +69,7 @@ class BellMediaIE(InfoExtractor): 'investigationdiscovery': 'invdisc', 'animalplanet': 'aniplan', 'etalk': 'ctv', + 'bnnbloomberg': 'bnn', } def _real_extract(self, url): From cfd7f2a6365e4d4ed9036b7fd873747be5e91d44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 May 2018 18:24:37 +0700 Subject: [PATCH 196/488] [apa] Add extractor (closes #15041, closes #15672) --- youtube_dl/extractor/apa.py | 94 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 23 ++++++++ 3 files changed, 118 insertions(+) create mode 100644 youtube_dl/extractor/apa.py diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py new file mode 100644 index 000000000..a30a935aa --- /dev/null +++ b/youtube_dl/extractor/apa.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + js_to_json, +) + + +class APAIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', + 'md5': '2b12292faeb0a7d930c778c7a5b4759b', + 'info_dict': { + 'id': 'jjv85FdZ', + 'ext': 'mp4', + 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 254, + 'timestamp': 1519211149, + 'upload_date': '20180221', + }, + }, { + 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', + 'only_matching': True, + }, { + 'url': 'http://uvp-rma.sf.apa.at/embed/70404cca-2f47-4855-bbb8-20b1fae58f76', + 'only_matching': True, + }, { + 'url': 'http://uvp-kleinezeitung.sf.apa.at/embed/f1c44979-dba2-4ebf-b021-e4cf2cac3c81', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + jwplatform_id = self._search_regex( + r'media[iI]d\s*:\s*["\'](?P[a-zA-Z0-9]{8})', webpage, + 'jwplatform id', default=None) + + if jwplatform_id: + return self.url_result( + 'jwplatform:' + jwplatform_id, ie='JWPlatform', + video_id=video_id) + + sources = self._parse_json( + self._search_regex( + r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'), + video_id, transform_source=js_to_json) + + formats = [] + for source in sources: + if not isinstance(source, dict): + continue + source_url = source.get('file') + if not source_url or not isinstance(source_url, compat_str): + continue + ext = determine_ext(source_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': source_url, + }) + self._sort_formats(formats) + + thumbnail = self._search_regex( + r'image\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'thumbnail', fatal=False, group='url') + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2f485012f..5f829c72c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -44,6 +44,7 @@ from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE from .aliexpress import AliExpressLiveIE +from .apa import APAIE from .aparat import AparatIE from .appleconnect import AppleConnectIE from .appletrailers import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0292e0458..dad951b75 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -110,6 +110,7 @@ from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE from .indavideo import IndavideoEmbedIE +from .apa import APAIE class GenericIE(InfoExtractor): @@ -2041,6 +2042,23 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # APA embed via JWPlatform embed + 'url': 'http://www.vol.at/blue-man-group/5593454', + 'info_dict': { + 'id': 'jjv85FdZ', + 'ext': 'mp4', + 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 254, + 'timestamp': 1519211149, + 'upload_date': '20180221', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -3068,6 +3086,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) + apa_urls = APAIE._extract_urls(webpage) + if apa_urls: + return self.playlist_from_matches( + apa_urls, video_id, video_title, ie=APAIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r']+?\bsrc\s*=\s*(["\'])(?P(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] From a07879d6b2edc474b0595a29932726fa7aa14b3a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 May 2018 00:10:01 +0100 Subject: [PATCH 197/488] [spiegel] fix info extraction(#16538) --- youtube_dl/extractor/spiegel.py | 78 +++++++++++---------------------- 1 file changed, 25 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index fc995e8c1..4df7f4ddc 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -11,9 +11,9 @@ from .nexx import ( from .spiegeltv import SpiegeltvIE from ..compat import compat_urlparse from ..utils import ( - extract_attributes, - unified_strdate, - get_element_by_attribute, + parse_duration, + strip_or_none, + unified_timestamp, ) @@ -21,35 +21,38 @@ class SpiegelIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': '2c2754212136f35fb4b19767d242f66e', + 'md5': 'b57399839d055fccfeb9a0455c439868', 'info_dict': { - 'id': '1259285', + 'id': '563747', 'ext': 'mp4', 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', 'duration': 49, 'upload_date': '20130311', + 'timestamp': 1362994320, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': 'f2cdf638d7aa47654e251e1aee360af1', + 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', 'info_dict': { - 'id': '1309159', + 'id': '580988', 'ext': 'mp4', 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', 'duration': 983, 'upload_date': '20131115', + 'timestamp': 1384546642, }, }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': 'd8eeca6bfc8f1cd6f490eb1f44695d51', + 'md5': '97b91083a672d72976faa8433430afb9', 'info_dict': { - 'id': '1519126', + 'id': '601883', 'ext': 'mp4', 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', 'upload_date': '20140904', + 'timestamp': 1409834160, } }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', @@ -62,59 +65,28 @@ class SpiegelIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage, handle = self._download_webpage_handle(url, video_id) + metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id + handle = self._request_webpage(metadata_url, video_id) # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html if SpiegeltvIE.suitable(handle.geturl()): return self.url_result(handle.geturl(), 'Spiegeltv') - nexx_id = self._search_regex( - r'nexxOmniaId\s*:\s*(\d+)', webpage, 'nexx id', default=None) - if nexx_id: - domain_id = NexxIE._extract_domain_id(webpage) or '748' - return self.url_result( - 'nexx:%s:%s' % (domain_id, nexx_id), ie=NexxIE.ie_key(), - video_id=nexx_id) - - video_data = extract_attributes(self._search_regex(r'(]+id="spVideoElements"[^>]+>)', webpage, 'video element', default='')) - - title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage) - description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description') - - base_url = self._search_regex( - [r'server\s*:\s*(["\'])(?P.+?)\1', r'var\s+server\s*=\s*"(?P[^"]+)\"'], - webpage, 'server URL', group='url') - - xml_url = base_url + video_id + '.xml' - idoc = self._download_xml(xml_url, video_id) - - formats = [] - for n in list(idoc): - if n.tag.startswith('type') and n.tag != 'type6': - format_id = n.tag.rpartition('type')[2] - video_url = base_url + n.find('./filename').text - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'width': int(n.find('./width').text), - 'height': int(n.find('./height').text), - 'abr': int(n.find('./audiobitrate').text), - 'vbr': int(n.find('./videobitrate').text), - 'vcodec': n.find('./codec').text, - 'acodec': 'MP4A', - }) - duration = float(idoc[0].findall('./duration')[0].text) - - self._check_formats(formats, video_id) - self._sort_formats(formats) + video_data = self._parse_json(self._webpage_read_content( + handle, metadata_url, video_id), video_id) + title = video_data['title'] + nexx_id = video_data['nexxOmniaId'] + domain_id = video_data.get('nexxOmniaDomain') or '748' return { + '_type': 'url_transparent', 'id': video_id, + 'url': 'nexx:%s:%s' % (domain_id, nexx_id), 'title': title, - 'description': description.strip() if description else None, - 'duration': duration, - 'upload_date': unified_strdate(video_data.get('data-video-date')), - 'formats': formats, + 'description': strip_or_none(video_data.get('teaser')), + 'duration': parse_duration(video_data.get('duration')), + 'timestamp': unified_timestamp(video_data.get('datum')), + 'ie_key': NexxIE.ie_key(), } From e0d42dd4b270d06a953822c091afefd946bd93f2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 May 2018 13:21:07 +0100 Subject: [PATCH 198/488] [teamcoco] Fix extraction for full episodes(closes #16573) --- youtube_dl/extractor/tbs.py | 61 ++++++------------ youtube_dl/extractor/teamcoco.py | 102 ++++++++++++++++++------------- youtube_dl/extractor/turner.py | 47 +++++++++++++- 3 files changed, 122 insertions(+), 88 deletions(-) diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index edc31729d..784f8ed66 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -4,6 +4,10 @@ from __future__ import unicode_literals import re from .turner import TurnerBaseIE +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) from ..utils import ( float_or_none, int_or_none, @@ -38,48 +42,22 @@ class TBSIE(TurnerBaseIE): def _real_extract(self, url): site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - video_data = self._parse_json(self._search_regex( + drupal_settings = self._parse_json(self._search_regex( r']+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})', - webpage, 'drupal setting'), display_id)['turner_playlist'][0] + webpage, 'drupal setting'), display_id) + video_data = drupal_settings['turner_playlist'][0] media_id = video_data['mediaID'] title = video_data['title'] + tokenizer_query = compat_parse_qs(compat_urllib_parse_urlparse( + drupal_settings['ngtv_token_url']).query) - streams_data = self._download_json( - 'http://medium.ngtv.io/media/%s/tv' % media_id, - media_id)['media']['tv'] - duration = None - chapters = [] - formats = [] - for supported_type in ('unprotected', 'bulkaes'): - stream_data = streams_data.get(supported_type, {}) - m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') - if not m3u8_url: - continue - if stream_data.get('playlistProtection') == 'spe': - m3u8_url = self._add_akamai_spe_token( - 'http://token.vgtf.net/token/token_spe', - m3u8_url, media_id, { - 'url': url, - 'site_name': site[:3].upper(), - 'auth_required': video_data.get('authRequired') == '1', - }) - formats.extend(self._extract_m3u8_formats( - m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) - - duration = float_or_none(stream_data.get('totalRuntime') or video_data.get('duration')) - - if not chapters: - for chapter in stream_data.get('contentSegments', []): - start_time = float_or_none(chapter.get('start')) - duration = float_or_none(chapter.get('duration')) - if start_time is None or duration is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': start_time + duration, - }) - self._sort_formats(formats) + info = self._extract_ngtv_info( + media_id, tokenizer_query, { + 'url': url, + 'site_name': site[:3].upper(), + 'auth_required': video_data.get('authRequired') == '1', + }) thumbnails = [] for image_id, image in video_data.get('images', {}).items(): @@ -98,15 +76,14 @@ class TBSIE(TurnerBaseIE): }) thumbnails.append(i) - return { + info.update({ 'id': media_id, 'title': title, 'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')), - 'duration': duration, + 'duration': float_or_none(video_data.get('duration')) or info.get('duration'), 'timestamp': int_or_none(video_data.get('created')), 'season_number': int_or_none(video_data.get('season')), 'episode_number': int_or_none(video_data.get('episode')), - 'cahpters': chapters, 'thumbnails': thumbnails, - 'formats': formats, - } + }) + return info diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 63fd4fe1c..73469cc5d 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import json -from .common import InfoExtractor +from .turner import TurnerBaseIE from ..utils import ( determine_ext, ExtractorError, @@ -15,7 +15,7 @@ from ..utils import ( ) -class TeamcocoIE(InfoExtractor): +class TeamcocoIE(TurnerBaseIE): _VALID_URL = r'https?://teamcoco\.com/(?P([^/]+/)*[^/?#]+)' _TESTS = [ { @@ -110,6 +110,8 @@ class TeamcocoIE(InfoExtractor): name } duration + turnerMediaId + turnerMediaAuthToken } } ... on NotFoundSlug { @@ -123,53 +125,65 @@ class TeamcocoIE(InfoExtractor): record = response['record'] video_id = record['id'] - video_sources = self._graphql_call('''{ - %s(id: "%s") { - src - } -}''', 'RecordVideoSource', video_id) or {} - - formats = [] - get_quality = qualities(['low', 'sd', 'hd', 'uhd']) - for format_id, src in video_sources.get('src', {}).items(): - if not isinstance(src, dict): - continue - src_url = src.get('src') - if not src_url: - continue - ext = determine_ext(src_url, mimetype2ext(src.get('type'))) - if format_id == 'hls' or ext == 'm3u8': - # compat_urllib_parse.urljoin does not work here - if src_url.startswith('/'): - src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url - formats.extend(self._extract_m3u8_formats( - src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) - else: - if src_url.startswith('/mp4:protected/'): - # TODO Correct extraction for these files - continue - tbr = int_or_none(self._search_regex( - r'(\d+)k\.mp4', src_url, 'tbr', default=None)) - - formats.append({ - 'url': src_url, - 'ext': ext, - 'tbr': tbr, - 'format_id': format_id, - 'quality': get_quality(format_id), - }) - if not formats: - formats = self._extract_m3u8_formats( - record['file']['url'], video_id, 'mp4', fatal=False) - self._sort_formats(formats) - - return { + info = { 'id': video_id, 'display_id': display_id, - 'formats': formats, 'title': record['title'], 'thumbnail': record.get('thumb', {}).get('preview'), 'description': record.get('teaser'), 'duration': parse_duration(record.get('duration')), 'timestamp': parse_iso8601(record.get('publishOn')), } + + media_id = record.get('turnerMediaId') + if media_id: + self._initialize_geo_bypass({ + 'countries': ['US'], + }) + info.update(self._extract_ngtv_info(media_id, { + 'accessToken': record['turnerMediaAuthToken'], + 'accessTokenType': 'jws', + })) + else: + video_sources = self._graphql_call('''{ + %s(id: "%s") { + src + } +}''', 'RecordVideoSource', video_id) or {} + + formats = [] + get_quality = qualities(['low', 'sd', 'hd', 'uhd']) + for format_id, src in video_sources.get('src', {}).items(): + if not isinstance(src, dict): + continue + src_url = src.get('src') + if not src_url: + continue + ext = determine_ext(src_url, mimetype2ext(src.get('type'))) + if format_id == 'hls' or ext == 'm3u8': + # compat_urllib_parse.urljoin does not work here + if src_url.startswith('/'): + src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url + formats.extend(self._extract_m3u8_formats( + src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + else: + if src_url.startswith('/mp4:protected/'): + # TODO Correct extraction for these files + continue + tbr = int_or_none(self._search_regex( + r'(\d+)k\.mp4', src_url, 'tbr', default=None)) + + formats.append({ + 'url': src_url, + 'ext': ext, + 'tbr': tbr, + 'format_id': format_id, + 'quality': get_quality(format_id), + }) + if not formats: + formats = self._extract_m3u8_formats( + record['file']['url'], video_id, 'mp4', fatal=False) + self._sort_formats(formats) + info['formats'] = formats + + return info diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index e73b64aeb..2b7b0d6e1 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -9,6 +9,7 @@ from ..utils import ( xpath_text, int_or_none, determine_ext, + float_or_none, parse_duration, xpath_attr, update_url_query, @@ -23,14 +24,17 @@ class TurnerBaseIE(AdobePassIE): def _extract_timestamp(self, video_data): return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) - def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data): + def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None): secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) if not token: query = { 'path': secure_path, - 'videoId': content_id, } + if custom_tokenizer_query: + query.update(custom_tokenizer_query) + else: + query['videoId'] = content_id if ap_data.get('auth_required'): query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) auth = self._download_xml( @@ -188,3 +192,42 @@ class TurnerBaseIE(AdobePassIE): 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), 'is_live': is_live, } + + def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None): + streams_data = self._download_json( + 'http://medium.ngtv.io/media/%s/tv' % media_id, + media_id)['media']['tv'] + duration = None + chapters = [] + formats = [] + for supported_type in ('unprotected', 'bulkaes'): + stream_data = streams_data.get(supported_type, {}) + m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') + if not m3u8_url: + continue + if stream_data.get('playlistProtection') == 'spe': + m3u8_url = self._add_akamai_spe_token( + 'http://token.ngtv.io/token/token_spe', + m3u8_url, media_id, ap_data or {}, tokenizer_query) + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + duration = float_or_none(stream_data.get('totalRuntime')) + + if not chapters: + for chapter in stream_data.get('contentSegments', []): + start_time = float_or_none(chapter.get('start')) + chapter_duration = float_or_none(chapter.get('duration')) + if start_time is None or chapter_duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + chapter_duration, + }) + self._sort_formats(formats) + + return { + 'formats': formats, + 'chapters': chapters, + 'duration': duration, + } From bc3143ac5e18731502df014e30c5fe89554e9d6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 May 2018 21:52:03 +0700 Subject: [PATCH 199/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index 280390ea0..95a5c556f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version + +Core +* [downloader/rtmp] Generalize download messages and report time elapsed + on finish +* [downloader/rtmp] Gracefully handle live streams interrupted by user + +Extractors +* [teamcoco] Fix extraction for full episodes (#16573) +* [spiegel] Fix info extraction (#16538) ++ [apa] Add support for apa.at (#15041, #15672) ++ [bellmedia] Add support for bnnbloomberg.ca (#16560) ++ [9c9media] Extract MPD formats and subtitles +* [cammodels] Use geo verification headers ++ [ufctv] Add support for authentication (#16542) ++ [cammodels] Add support for cammodels.com (#14499) +* [utils] Fix style id extraction for namespaced id attribute in dfxp2srt + (#16551) +* [soundcloud] Detect format extension (#16549) +* [cbc] Fix playlist title extraction (#16502) ++ [tumblr] Detect and report sensitive media (#13829) ++ [tumblr] Add support for authentication (#15133) + + version 2018.05.26 Core From e425710554f1ed96504389fb526b898a942012dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 May 2018 21:54:30 +0700 Subject: [PATCH 200/488] release 2018.05.30 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c4d4e534e..b47a450a4 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.26** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.30*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.30** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.26 +[debug] youtube-dl version 2018.05.30 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 95a5c556f..4e989caf7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.05.30 Core * [downloader/rtmp] Generalize download messages and report time elapsed diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b60f2ff23..c2d5401d6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -15,7 +15,6 @@ - **8tracks** - **91porn** - **9c9media** - - **9c9media:stack** - **9gag** - **9now.com.au** - **abc.net.au** @@ -48,6 +47,7 @@ - **anitube.se** - **Anvato** - **AnySex** + - **APA** - **Aparat** - **AppleConnect** - **AppleDaily**: 臺灣蘋果日報 @@ -128,6 +128,7 @@ - **BYUtv** - **Camdemy** - **CamdemyFolder** + - **CamModels** - **CamWithHer** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2253da927..0f15738b2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.26' +__version__ = '2018.05.30' From 4fd1437d9d617069494a471ba40341c2ad6623b6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 May 2018 17:08:32 +0100 Subject: [PATCH 201/488] [rbmaradio] check formats availability(closes #16585) --- youtube_dl/extractor/rbmaradio.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index afa7b9161..9c4d72bbd 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -54,6 +54,7 @@ class RBMARadioIE(InfoExtractor): 'abr': abr, 'vcodec': 'none', } for abr in (96, 128, 256)] + self._check_formats(formats, episode_id) description = clean_html(episode.get('longTeaser')) thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) From 128b58ad139f2e62274ab6a649b965f5fa01a533 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 May 2018 02:49:35 +0100 Subject: [PATCH 202/488] [nhl] remove old extractors --- youtube_dl/extractor/extractors.py | 7 +- youtube_dl/extractor/nhl.py | 345 +++++------------------------ 2 files changed, 62 insertions(+), 290 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5f829c72c..93b22a8c3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -705,12 +705,7 @@ from .nexx import ( from .nfb import NFBIE from .nfl import NFLIE from .nhk import NhkVodIE -from .nhl import ( - NHLVideocenterIE, - NHLNewsIE, - NHLVideocenterCategoryIE, - NHLIE, -) +from .nhl import NHLIE from .nick import ( NickIE, NickBrIE, diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 62ce800c0..cf440f713 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -1,18 +1,10 @@ from __future__ import unicode_literals import re -import json -import os from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_str, -) +from ..compat import compat_str from ..utils import ( - unified_strdate, determine_ext, int_or_none, parse_iso8601, @@ -20,236 +12,77 @@ from ..utils import ( ) -class NHLBaseInfoExtractor(InfoExtractor): - @staticmethod - def _fix_json(json_string): - return json_string.replace('\\\'', '\'') +class NHLBaseIE(InfoExtractor): + def _real_extract(self, url): + site, tmp_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'https://%s/%s/%sid/v1/%s/details/web-v1.json' + % (self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id) + if video_data.get('type') != 'video': + video_data = video_data['media'] + video = video_data.get('video') + if video: + video_data = video + else: + videos = video_data.get('videos') + if videos: + video_data = videos[0] - def _real_extract_video(self, video_id): - vid_parts = video_id.split(',') - if len(vid_parts) == 3: - video_id = '%s0%s%s-X-h' % (vid_parts[0][:4], vid_parts[1], vid_parts[2].rjust(4, '0')) - json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id - data = self._download_json( - json_url, video_id, transform_source=self._fix_json) - return self._extract_video(data[0]) + video_id = compat_str(video_data['id']) + title = video_data['title'] - def _extract_video(self, info): - video_id = info['id'] - self.report_extraction(video_id) + formats = [] + for playback in video_data.get('playbacks', []): + playback_url = playback.get('url') + if not playback_url: + continue + ext = determine_ext(playback_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + playback_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=playback.get('name', 'hls'), fatal=False) + self._check_formats(m3u8_formats, video_id) + formats.extend(m3u8_formats) + else: + height = int_or_none(playback.get('height')) + formats.append({ + 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), + 'url': playback_url, + 'width': int_or_none(playback.get('width')), + 'height': height, + 'tbr': int_or_none(self._search_regex(r'_(\d+)[kK]', playback_url, 'bitrate', default=None)), + }) + self._sort_formats(formats) - initial_video_url = info['publishPoint'] - if info['formats'] == '1': - parsed_url = compat_urllib_parse_urlparse(initial_video_url) - filename, ext = os.path.splitext(parsed_url.path) - path = '%s_sd%s' % (filename, ext) - data = compat_urllib_parse_urlencode({ - 'type': 'fvod', - 'path': compat_urlparse.urlunparse(parsed_url[:2] + (path,) + parsed_url[3:]) + thumbnails = [] + cuts = video_data.get('image', {}).get('cuts') or [] + if isinstance(cuts, dict): + cuts = cuts.values() + for thumbnail_data in cuts: + thumbnail_url = thumbnail_data.get('src') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail_data.get('width')), + 'height': int_or_none(thumbnail_data.get('height')), }) - path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data - path_doc = self._download_xml( - path_url, video_id, 'Downloading final video url') - video_url = path_doc.find('path').text - else: - video_url = initial_video_url - - join = compat_urlparse.urljoin - ret = { - 'id': video_id, - 'title': info['name'], - 'url': video_url, - 'description': info['description'], - 'duration': int(info['duration']), - 'thumbnail': join(join(video_url, '/u/'), info['bigImage']), - 'upload_date': unified_strdate(info['releaseDate'].split('.')[0]), - } - if video_url.startswith('rtmp:'): - mobj = re.match(r'(?Prtmp://[^/]+/(?P[a-z0-9/]+))/(?Pmp4:.*)', video_url) - ret.update({ - 'tc_url': mobj.group('tc_url'), - 'play_path': mobj.group('play_path'), - 'app': mobj.group('app'), - 'no_resume': True, - }) - return ret - - -class NHLVideocenterIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter' - _VALID_URL = r'https?://video(?P\.[^.]*)?\.nhl\.com/videocenter/(?:console|embed)?(?:\?(?:.*?[?&])?)(?:id|hlg|playlist)=(?P[-0-9a-zA-Z,]+)' - - _TESTS = [{ - 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', - 'md5': 'db704a4ea09e8d3988c85e36cc892d09', - 'info_dict': { - 'id': '453614', - 'ext': 'mp4', - 'title': 'Quick clip: Weise 4-3 goal vs Flames', - 'description': 'Dale Weise scores his first of the season to put the Canucks up 4-3.', - 'duration': 18, - 'upload_date': '20131006', - }, - }, { - 'url': 'http://video.nhl.com/videocenter/console?id=2014020024-628-h', - 'md5': 'd22e82bc592f52d37d24b03531ee9696', - 'info_dict': { - 'id': '2014020024-628-h', - 'ext': 'mp4', - 'title': 'Alex Galchenyuk Goal on Ray Emery (14:40/3rd)', - 'description': 'Home broadcast - Montreal Canadiens at Philadelphia Flyers - October 11, 2014', - 'duration': 0, - 'upload_date': '20141011', - }, - }, { - 'url': 'http://video.mapleleafs.nhl.com/videocenter/console?id=58665&catid=802', - 'md5': 'c78fc64ea01777e426cfc202b746c825', - 'info_dict': { - 'id': '58665', - 'ext': 'flv', - 'title': 'Classic Game In Six - April 22, 1979', - 'description': 'It was the last playoff game for the Leafs in the decade, and the last time the Leafs and Habs played in the playoffs. Great game, not a great ending.', - 'duration': 400, - 'upload_date': '20100129' - }, - }, { - 'url': 'http://video.flames.nhl.com/videocenter/console?id=630616', - 'only_matching': True, - }, { - 'url': 'http://video.nhl.com/videocenter/?id=736722', - 'only_matching': True, - }, { - 'url': 'http://video.nhl.com/videocenter/console?hlg=20142015,2,299&lang=en', - 'md5': '076fcb88c255154aacbf0a7accc3f340', - 'info_dict': { - 'id': '2014020299-X-h', - 'ext': 'mp4', - 'title': 'Penguins at Islanders / Game Highlights', - 'description': 'Home broadcast - Pittsburgh Penguins at New York Islanders - November 22, 2014', - 'duration': 268, - 'upload_date': '20141122', - } - }, { - 'url': 'http://video.oilers.nhl.com/videocenter/console?id=691469&catid=4', - 'info_dict': { - 'id': '691469', - 'ext': 'mp4', - 'title': 'RAW | Craig MacTavish Full Press Conference', - 'description': 'Oilers GM Craig MacTavish addresses the media at Rexall Place on Friday.', - 'upload_date': '20141205', - }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } - }, { - 'url': 'http://video.nhl.com/videocenter/embed?playlist=836127', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._real_extract_video(video_id) - - -class NHLNewsIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:news' - IE_DESC = 'NHL news' - _VALID_URL = r'https?://(?:.+?\.)?nhl\.com/(?:ice|club)/news\.html?(?:\?(?:.*?[?&])?)id=(?P[-0-9a-zA-Z]+)' - - _TESTS = [{ - 'url': 'http://www.nhl.com/ice/news.htm?id=750727', - 'md5': '4b3d1262e177687a3009937bd9ec0be8', - 'info_dict': { - 'id': '736722', - 'ext': 'mp4', - 'title': 'Cal Clutterbuck has been fined $2,000', - 'description': 'md5:45fe547d30edab88b23e0dd0ab1ed9e6', - 'duration': 37, - 'upload_date': '20150128', - }, - }, { - # iframe embed - 'url': 'http://sabres.nhl.com/club/news.htm?id=780189', - 'md5': '9f663d1c006c90ac9fb82777d4294e12', - 'info_dict': { - 'id': '836127', - 'ext': 'mp4', - 'title': 'Morning Skate: OTT vs. BUF (9/23/15)', - 'description': "Brian Duff chats with Tyler Ennis prior to Buffalo's first preseason home game.", - 'duration': 93, - 'upload_date': '20150923', - }, - }] - - def _real_extract(self, url): - news_id = self._match_id(url) - webpage = self._download_webpage(url, news_id) - video_id = self._search_regex( - [r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'", - r']+src=["\']https?://video.*?\.nhl\.com/videocenter/embed\?.*\bplaylist=(\d+)'], - webpage, 'video id') - return self._real_extract_video(video_id) - - -class NHLVideocenterCategoryIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter:category' - IE_DESC = 'NHL videocenter category' - _VALID_URL = r'https?://video\.(?P[^.]*)\.nhl\.com/videocenter/(console\?[^(id=)]*catid=(?P[0-9]+)(?![&?]id=).*?)?$' - _TEST = { - 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=999', - 'info_dict': { - 'id': '999', - 'title': 'Highlights', - }, - 'playlist_count': 12, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - team = mobj.group('team') - webpage = self._download_webpage(url, team) - cat_id = self._search_regex( - [r'var defaultCatId = "(.+?)";', - r'{statusIndex:0,index:0,.*?id:(.*?),'], - webpage, 'category id') - playlist_title = self._html_search_regex( - r'tab0"[^>]*?>(.*?)', - webpage, 'playlist title', flags=re.DOTALL).lower().capitalize() - - data = compat_urllib_parse_urlencode({ - 'cid': cat_id, - # This is the default value - 'count': 12, - 'ptrs': 3, - 'format': 'json', - }) - path = '/videocenter/servlets/browse?' + data - request_url = compat_urlparse.urljoin(url, path) - response = self._download_webpage(request_url, playlist_title) - response = self._fix_json(response) - if not response.strip(): - self._downloader.report_warning('Got an empty response, trying ' - 'adding the "newvideos" parameter') - response = self._download_webpage(request_url + '&newvideos=true', - playlist_title) - response = self._fix_json(response) - videos = json.loads(response) return { - '_type': 'playlist', - 'title': playlist_title, - 'id': cat_id, - 'entries': [self._extract_video(v) for v in videos], + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, } -class NHLIE(InfoExtractor): +class NHLIE(NHLBaseIE): IE_NAME = 'nhl.com' _VALID_URL = r'https?://(?:www\.)?(?Pnhl|wch2016)\.com/(?:[^/]+/)*c-(?P\d+)' - _SITES_MAP = { - 'nhl': 'nhl', - 'wch2016': 'wch', - } + _CONTENT_DOMAIN = 'nhl.bamcontent.com' _TESTS = [{ # type=video 'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503', @@ -293,59 +126,3 @@ class NHLIE(InfoExtractor): 'url': 'https://www.wch2016.com/news/3-stars-team-europe-vs-team-canada/c-282195068', 'only_matching': True, }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - tmp_id, site = mobj.group('id'), mobj.group('site') - video_data = self._download_json( - 'https://nhl.bamcontent.com/%s/id/v1/%s/details/web-v1.json' - % (self._SITES_MAP[site], tmp_id), tmp_id) - if video_data.get('type') == 'article': - video_data = video_data['media'] - - video_id = compat_str(video_data['id']) - title = video_data['title'] - - formats = [] - for playback in video_data.get('playbacks', []): - playback_url = playback.get('url') - if not playback_url: - continue - ext = determine_ext(playback_url) - if ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - playback_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=playback.get('name', 'hls'), fatal=False) - self._check_formats(m3u8_formats, video_id) - formats.extend(m3u8_formats) - else: - height = int_or_none(playback.get('height')) - formats.append({ - 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), - 'url': playback_url, - 'width': int_or_none(playback.get('width')), - 'height': height, - }) - self._sort_formats(formats, ('preference', 'width', 'height', 'tbr', 'format_id')) - - thumbnails = [] - for thumbnail_id, thumbnail_data in video_data.get('image', {}).get('cuts', {}).items(): - thumbnail_url = thumbnail_data.get('src') - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'width': int_or_none(thumbnail_data.get('width')), - 'height': int_or_none(thumbnail_data.get('height')), - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'timestamp': parse_iso8601(video_data.get('date')), - 'duration': parse_duration(video_data.get('duration')), - 'thumbnails': thumbnails, - 'formats': formats, - } From acca2ac7f3f4c78bce775d47736caa63e6872e26 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 May 2018 02:50:14 +0100 Subject: [PATCH 203/488] [mlb] improve extraction(closes #16587) --- youtube_dl/extractor/mlb.py | 105 +++++++++--------------------------- 1 file changed, 24 insertions(+), 81 deletions(-) diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 675ff6873..b907f6b49 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -1,96 +1,90 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, - parse_iso8601, -) +from .nhl import NHLBaseIE -class MLBIE(InfoExtractor): +class MLBIE(NHLBaseIE): _VALID_URL = r'''(?x) https?:// - (?:[\da-z_-]+\.)*mlb\.com/ + (?:[\da-z_-]+\.)*(?Pmlb)\.com/ (?: (?: - (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)| + (?:[^/]+/)*c-| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| )\?.*?\bcontent_id= ) - (?Pn?\d+)| - (?:[^/]+/)*(?P[^/]+) + (?P\d+) ) ''' + _CONTENT_DOMAIN = 'content.mlb.com' _TESTS = [ { - 'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea', - 'md5': 'ff56a598c2cf411a9a38a69709e97079', + 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', + 'md5': '632358dacfceec06bad823b83d21df2d', 'info_dict': { 'id': '34698933', 'ext': 'mp4', 'title': "Ackley's spectacular catch", 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0', 'duration': 66, - 'timestamp': 1405980600, - 'upload_date': '20140721', + 'timestamp': 1405995000, + 'upload_date': '20140722', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby', - 'md5': 'd9c022c10d21f849f49c05ae12a8a7e9', + 'url': 'https://www.mlb.com/video/stanton-prepares-for-derby/c-34496663', + 'md5': 'bf2619bf9cacc0a564fc35e6aeb9219f', 'info_dict': { 'id': '34496663', 'ext': 'mp4', 'title': 'Stanton prepares for Derby', 'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57', 'duration': 46, - 'timestamp': 1405105800, + 'timestamp': 1405120200, 'upload_date': '20140711', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby', - 'md5': '0e6e73d509321e142409b695eadd541f', + 'url': 'https://www.mlb.com/video/cespedes-repeats-as-derby-champ/c-34578115', + 'md5': '99bb9176531adc600b90880fb8be9328', 'info_dict': { 'id': '34578115', 'ext': 'mp4', 'title': 'Cespedes repeats as Derby champ', 'description': 'md5:08df253ce265d4cf6fb09f581fafad07', 'duration': 488, - 'timestamp': 1405399936, + 'timestamp': 1405414336, 'upload_date': '20140715', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance', - 'md5': 'b8fd237347b844365d74ea61d4245967', + 'url': 'https://www.mlb.com/video/bautista-on-home-run-derby/c-34577915', + 'md5': 'da8b57a12b060e7663ee1eebd6f330ec', 'info_dict': { 'id': '34577915', 'ext': 'mp4', 'title': 'Bautista on Home Run Derby', 'description': 'md5:b80b34031143d0986dddc64a8839f0fb', 'duration': 52, - 'timestamp': 1405390722, + 'timestamp': 1405405122, 'upload_date': '20140715', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer', - 'md5': 'aafaf5b0186fee8f32f20508092f8111', + 'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098', + 'md5': 'e09e37b552351fddbf4d9e699c924d68', 'info_dict': { 'id': '75609783', 'ext': 'mp4', 'title': 'Must C: Pillar climbs for catch', 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', - 'timestamp': 1429124820, + 'timestamp': 1429139220, 'upload_date': '20150415', } }, @@ -111,7 +105,7 @@ class MLBIE(InfoExtractor): 'only_matching': True, }, { - 'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728', + 'url': 'https://www.mlb.com/cardinals/video/piscottys-great-sliding-catch/c-51175783', 'only_matching': True, }, { @@ -120,58 +114,7 @@ class MLBIE(InfoExtractor): 'only_matching': True, }, { - 'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#', + 'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842', 'only_matching': True, } ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - if not video_id: - video_path = mobj.group('path') - webpage = self._download_webpage(url, video_path) - video_id = self._search_regex( - [r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id') - - detail = self._download_xml( - 'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml' - % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id) - - title = detail.find('./headline').text - description = detail.find('./big-blurb').text - duration = parse_duration(detail.find('./duration').text) - timestamp = parse_iso8601(detail.attrib['date'][:-5]) - - thumbnails = [{ - 'url': thumbnail.text, - } for thumbnail in detail.findall('./thumbnailScenarios/thumbnailScenario')] - - formats = [] - for media_url in detail.findall('./url'): - playback_scenario = media_url.attrib['playback_scenario'] - fmt = { - 'url': media_url.text, - 'format_id': playback_scenario, - } - m = re.search(r'(?P\d+)K_(?P\d+)X(?P\d+)', playback_scenario) - if m: - fmt.update({ - 'vbr': int(m.group('vbr')) * 1000, - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - formats.append(fmt) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } From 3a8e3730c198dd7cb8be76f04d101c66361da6b9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 May 2018 11:40:37 +0100 Subject: [PATCH 204/488] [francetv] add support for sport.francetvinfo.fr(closes #15645) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/francetv.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 93b22a8c3..b05afd101 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -381,6 +381,7 @@ from .francetv import ( FranceTVSiteIE, FranceTVEmbedIE, FranceTVInfoIE, + FranceTVInfoSportIE, FranceTVJeunesseIE, GenerationWhatIE, CultureboxIE, diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index c02cd03de..6fc6b0da0 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -379,6 +379,31 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): return self._make_url_result(video_id, catalogue) +class FranceTVInfoSportIE(FranceTVBaseInfoExtractor): + IE_NAME = 'sport.francetvinfo.fr' + _VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018', + 'info_dict': { + 'id': '6e49080e-3f45-11e8-b459-000d3a2439ea', + 'ext': 'mp4', + 'title': 'Retour sur les meilleurs moments de Pyeongchang 2018', + 'timestamp': 1523639962, + 'upload_date': '20180413', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [FranceTVIE.ie_key()], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id') + return self._make_url_result(video_id, 'Sport-web') + + class GenerationWhatIE(InfoExtractor): IE_NAME = 'france2.fr:generation-what' _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P[^/?#&]+)' From c3f75e2454051021c33f88c982913cba8c651188 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 May 2018 12:39:45 +0100 Subject: [PATCH 205/488] [audimedia] fix extraction(closes #15309) --- youtube_dl/extractor/audimedia.py | 48 +++++++++++++++++-------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index aa6925623..6bd48ef15 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -5,13 +5,12 @@ from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, - sanitized_Request, ) class AudiMediaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P[^/?#]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?:video/)?(?P[^/?#]+)' + _TESTS = [{ 'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467', 'md5': '79a8b71c46d49042609795ab59779b66', 'info_dict': { @@ -24,41 +23,46 @@ class AudiMediaIE(InfoExtractor): 'duration': 74022, 'view_count': int, } - } - # extracted from https://audimedia.tv/assets/embed/embedded-player.js (dataSourceAuthToken) - _AUTH_TOKEN = 'e25b42847dba18c6c8816d5d8ce94c326e06823ebf0859ed164b3ba169be97f2' + }, { + 'url': 'https://www.audi-mediacenter.com/en/audimediatv/video/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-2991', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) raw_payload = self._search_regex([ - r'class="amtv-embed"[^>]+id="([^"]+)"', - r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"', + r'class="amtv-embed"[^>]+id="([0-9a-z-]+)"', + r'id="([0-9a-z-]+)"[^>]+class="amtv-embed"', + r'class=\\"amtv-embed\\"[^>]+id=\\"([0-9a-z-]+)\\"', + r'id=\\"([0-9a-z-]+)\\"[^>]+class=\\"amtv-embed\\"', + r'id=(?:\\)?"(amtve-[a-z]-\d+-[a-z]{2})', ], webpage, 'raw payload') - _, stage_mode, video_id, lang = raw_payload.split('-') + _, stage_mode, video_id, _ = raw_payload.split('-') # TODO: handle s and e stage_mode (live streams and ended live streams) if stage_mode not in ('s', 'e'): - request = sanitized_Request( - 'https://audimedia.tv/api/video/v1/videos/%s?embed[]=video_versions&embed[]=thumbnail_image&where[content_language_iso]=%s' % (video_id, lang), - headers={'X-Auth-Token': self._AUTH_TOKEN}) - json_data = self._download_json(request, video_id)['results'] + video_data = self._download_json( + 'https://www.audimedia.tv/api/video/v1/videos/' + video_id, + video_id, query={ + 'embed[]': ['video_versions', 'thumbnail_image'], + })['results'] formats = [] - stream_url_hls = json_data.get('stream_url_hls') + stream_url_hls = video_data.get('stream_url_hls') if stream_url_hls: formats.extend(self._extract_m3u8_formats( stream_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - stream_url_hds = json_data.get('stream_url_hds') + stream_url_hds = video_data.get('stream_url_hds') if stream_url_hds: formats.extend(self._extract_f4m_formats( stream_url_hds + '?hdcore=3.4.0', video_id, f4m_id='hds', fatal=False)) - for video_version in json_data.get('video_versions'): + for video_version in video_data.get('video_versions', []): video_version_url = video_version.get('download_url') or video_version.get('stream_url') if not video_version_url: continue @@ -79,11 +83,11 @@ class AudiMediaIE(InfoExtractor): return { 'id': video_id, - 'title': json_data['title'], - 'description': json_data.get('subtitle'), - 'thumbnail': json_data.get('thumbnail_image', {}).get('file'), - 'timestamp': parse_iso8601(json_data.get('publication_date')), - 'duration': int_or_none(json_data.get('duration')), - 'view_count': int_or_none(json_data.get('view_count')), + 'title': video_data['title'], + 'description': video_data.get('subtitle'), + 'thumbnail': video_data.get('thumbnail_image', {}).get('file'), + 'timestamp': parse_iso8601(video_data.get('publication_date')), + 'duration': int_or_none(video_data.get('duration')), + 'view_count': int_or_none(video_data.get('view_count')), 'formats': formats, } From 0bfdcc14956557294d8b5ab7309a5f31b3710888 Mon Sep 17 00:00:00 2001 From: DroidFreak32 Date: Thu, 31 May 2018 20:31:44 +0530 Subject: [PATCH 206/488] [openload] Add support for oload.win and oload.download --- youtube_dl/extractor/openload.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d0bdd60b8..702f86b44 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -301,6 +301,12 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.xyz/f/WwRBpzW8Wtk', 'only_matching': True, + }, { + 'url': 'https://oload.win/f/kUEfGclsU9o', + 'only_matching': True, + }, { + 'url': 'https://oload.download/f/kUEfGclsU9o', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From 2593725a9bd1347ab54435dc0b48dd7b878f38c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Jun 2018 05:16:00 +0700 Subject: [PATCH 207/488] [twitter:card] Add support for another endpoint (closes #16586) --- youtube_dl/extractor/twitter.py | 49 +++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index d7e425041..4a77e792e 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -63,7 +63,7 @@ class TwitterCardIE(TwitterBaseIE): 'id': '623160978427936768', 'ext': 'mp4', 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*(?:\bformat=|\.)jpg', + 'thumbnail': r're:^https?://.*$', }, }, { @@ -223,15 +223,38 @@ class TwitterCardIE(TwitterBaseIE): formats.extend(self._extract_mobile_formats(username, video_id)) if formats: + title = self._search_regex(r'([^<]+)', webpage, 'title') + thumbnail = config.get('posterImageUrl') or config.get('image_src') + duration = float_or_none(config.get('duration'), scale=1000) or duration break + if not formats: + config = self._download_json( + 'https://api.twitter.com/1.1/videos/tweet/config/%s.json' % video_id, + video_id, headers={ + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE', + }) + track = config['track'] + vmap_url = track.get('vmapUrl') + if vmap_url: + formats = self._extract_formats_from_vmap_url(vmap_url, video_id) + else: + playback_url = track['playbackUrl'] + if determine_ext(playback_url) == 'm3u8': + formats = self._extract_m3u8_formats( + playback_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + else: + formats = [{ + 'url': playback_url, + }] + title = 'Twitter web player' + thumbnail = config.get('posterImage') + duration = float_or_none(track.get('durationMs'), scale=1000) + self._remove_duplicate_formats(formats) self._sort_formats(formats) - title = self._search_regex(r'([^<]+)', webpage, 'title') - thumbnail = config.get('posterImageUrl') or config.get('image_src') - duration = float_or_none(config.get('duration'), scale=1000) or duration - return { 'id': video_id, 'title': title, @@ -375,6 +398,22 @@ class TwitterIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, { + # card via api.twitter.com/1.1/videos/tweet/config + 'url': 'https://twitter.com/LisPower1/status/1001551623938805763', + 'info_dict': { + 'id': '1001551623938805763', + 'ext': 'mp4', + 'title': 're:.*?Shep is on a roll today.*?', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:63b036c228772523ae1924d5f8e5ed6b', + 'uploader': 'Lis Power', + 'uploader_id': 'LisPower1', + 'duration': 111.278, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }] def _real_extract(self, url): From 926d97fc6b018a25ea777dfcfb9a84a10920c2b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Jun 2018 05:17:49 +0700 Subject: [PATCH 208/488] [9c9media] PEP 8 --- youtube_dl/extractor/ninecninemedia.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index 875665d43..65754c5e7 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( parse_iso8601, float_or_none, From 85750f897293b5a56e6be521f8b0be3eec082899 Mon Sep 17 00:00:00 2001 From: Enes Date: Fri, 1 Jun 2018 20:16:22 +0300 Subject: [PATCH 209/488] [openload] Improve ext extraction --- test/test_utils.py | 1 + youtube_dl/extractor/openload.py | 7 +++++-- youtube_dl/utils.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index f2b51131c..e63af0166 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -361,6 +361,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(determine_ext('http://example.com/foo/bar.nonext/?download', None), None) self.assertEqual(determine_ext('http://example.com/foo/bar/mp4?download', None), None) self.assertEqual(determine_ext('http://example.com/foo/bar.m3u8//?download'), 'm3u8') + self.assertEqual(determine_ext('foobar', None), None) def test_find_xpath_attr(self): testxml = ''' diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 702f86b44..d264fe206 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -307,6 +307,10 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.download/f/kUEfGclsU9o', 'only_matching': True, + }, { + # Its title has not got its extension but url has it + 'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' @@ -368,8 +372,7 @@ class OpenloadIE(InfoExtractor): 'title': title, 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), 'url': video_url, - # Seems all videos have extensions in their titles - 'ext': determine_ext(title, 'mp4'), + 'ext': determine_ext(title, None) or determine_ext(url, 'mp4'), 'subtitles': subtitles, 'http_headers': headers, } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 63f24c0b6..6a3199fb9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1228,7 +1228,7 @@ def unified_timestamp(date_str, day_first=True): def determine_ext(url, default_ext='unknown_video'): - if url is None: + if url is None or '.' not in url: return default_ext guess = url.partition('?')[0].rpartition('.')[2] if re.match(r'^[A-Za-z0-9]+$', guess): From b995043ab8b987cb5d4d83a3b56bb28d009ac0cb Mon Sep 17 00:00:00 2001 From: Logan Fleur Date: Fri, 1 Jun 2018 19:18:57 +0200 Subject: [PATCH 210/488] Ignore venv directory --- .gitignore | 1 + setup.cfg | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index fbf7cecb2..f064a0d9e 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,4 @@ youtube-dl.zsh *.iml tmp/ +venv/ diff --git a/setup.cfg b/setup.cfg index 5208f7ae2..af9a554c6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,5 +2,5 @@ universal = True [flake8] -exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git +exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git,venv ignore = E402,E501,E731,E741 From f20f636596aa4ec949360e7b05f6b9499e28c2a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jun 2018 00:35:07 +0700 Subject: [PATCH 211/488] [cbc] Improve extraction (closes #16583, closes #16593) --- youtube_dl/extractor/cbc.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index ce8e3d346..43f95c739 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -17,6 +17,7 @@ from ..utils import ( xpath_element, xpath_with_ns, find_xpath_attr, + orderedSet, parse_duration, parse_iso8601, parse_age_limit, @@ -136,9 +137,15 @@ class CBCIE(InfoExtractor): entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] + media_ids = [] + for media_id_re in ( + r']+src="[^"]+?mediaId=(\d+)"', + r']+\bid=["\']player-(\d+)', + r'guid["\']\s*:\s*["\'](\d+)'): + media_ids.extend(re.findall(media_id_re, webpage)) entries.extend([ self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) - for media_id in re.findall(r']+src="[^"]+?mediaId=(\d+)"', webpage)]) + for media_id in orderedSet(media_ids)]) return self.playlist_result( entries, display_id, strip_or_none(title), self._og_search_description(webpage)) From 9d082e7cb81c63dc5c2e616cd7ca7237a5e0d642 Mon Sep 17 00:00:00 2001 From: Nathan Rossi Date: Sat, 26 May 2018 02:34:22 +1000 Subject: [PATCH 212/488] [facebook] Add support for tahoe player videos (closes #15441) Specific videos appear to use a newer/different player, this requires a second request for the video data as the initial request is missing the specified data. Additionally these videos have different page content for the uploader value, which is stored in the `` element of the initial request. --- youtube_dl/extractor/facebook.py | 38 +++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 0971ce356..8bbca4f56 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -56,6 +56,7 @@ class FacebookIE(InfoExtractor): _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' + _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true' _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', @@ -208,6 +209,17 @@ class FacebookIE(InfoExtractor): # no title 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', + 'info_dict': { + 'id': '359649331226507', + 'ext': 'mp4', + 'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses', + 'uploader': 'ESL One Dota 2', + }, + 'params': { + 'skip_download': True, + }, }] @staticmethod @@ -323,6 +335,24 @@ class FacebookIE(InfoExtractor): server_js_data, lambda x: x['jsmods']['instances'], list) or []) + if not video_data: + # video info not in first request, do a secondary request using tahoe player specific url + tahoe_data = self._download_webpage( + self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, + data=urlencode_postdata({ + '__user': 0, + '__a': 1, + '__pc': self._search_regex(r'"pkg_cohort":"(.*?)"', webpage, 'pkg cohort', default='PHASED:DEFAULT'), + '__rev': self._search_regex(r'"client_revision":(\d+),', webpage, 'client revision', default=3944515), + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + tahoe_js_data = self._parse_json(self._search_regex( + r'for \(;;\);(.+)', tahoe_data, + 'tahoe js data', default='{}'), video_id, fatal=False) + video_data = extract_video_data(tahoe_js_data.get('jsmods', {}).get('instances', [])) + if not video_data: if not fatal_if_no_video: return webpage, False @@ -378,9 +408,11 @@ class FacebookIE(InfoExtractor): video_title = limit_length(video_title, 80) else: video_title = 'Facebook video #%s' % video_id - uploader = clean_html(get_element_by_id( - 'fbPhotoPageAuthorName', webpage)) or self._search_regex( - r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', fatal=False) + uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) + if not uploader: + uploader = self._search_regex( + [r'ownerName\s*:\s*"([^"]+)"', r'property="og:title"\s*content="(.*?)"'], + webpage, 'uploader', fatal=False) timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) From 9b89daefa6fc3dbfce0d725283ecef753a8603ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jun 2018 01:32:18 +0700 Subject: [PATCH 213/488] [facebook] Improve extraction (closes #16554) --- youtube_dl/extractor/facebook.py | 66 ++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 8bbca4f56..8a9ed96c2 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -324,34 +324,18 @@ class FacebookIE(InfoExtractor): if server_js_data: video_data = extract_video_data(server_js_data.get('instances', [])) + def extract_from_jsmods_instances(js_data): + if js_data: + return extract_video_data(try_get( + js_data, lambda x: x['jsmods']['instances'], list) or []) + if not video_data: server_js_data = self._parse_json( self._search_regex( r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)', webpage, 'js data', default='{}'), video_id, transform_source=js_to_json, fatal=False) - if server_js_data: - video_data = extract_video_data(try_get( - server_js_data, lambda x: x['jsmods']['instances'], - list) or []) - - if not video_data: - # video info not in first request, do a secondary request using tahoe player specific url - tahoe_data = self._download_webpage( - self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, - data=urlencode_postdata({ - '__user': 0, - '__a': 1, - '__pc': self._search_regex(r'"pkg_cohort":"(.*?)"', webpage, 'pkg cohort', default='PHASED:DEFAULT'), - '__rev': self._search_regex(r'"client_revision":(\d+),', webpage, 'client revision', default=3944515), - }), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - tahoe_js_data = self._parse_json(self._search_regex( - r'for \(;;\);(.+)', tahoe_data, - 'tahoe js data', default='{}'), video_id, fatal=False) - video_data = extract_video_data(tahoe_js_data.get('jsmods', {}).get('instances', [])) + video_data = extract_from_jsmods_instances(server_js_data) if not video_data: if not fatal_if_no_video: @@ -363,8 +347,33 @@ class FacebookIE(InfoExtractor): expected=True) elif '>You must log in to continue' in webpage: self.raise_login_required() - else: - raise ExtractorError('Cannot parse data') + + # Video info not in first request, do a secondary request using + # tahoe player specific URL + tahoe_data = self._download_webpage( + self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, + data=urlencode_postdata({ + '__user': 0, + '__a': 1, + '__pc': self._search_regex( + r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, + 'pkg cohort', default='PHASED:DEFAULT'), + '__rev': self._search_regex( + r'client_revision["\']\s*:\s*(\d+),', webpage, + 'client revision', default='3944515'), + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + tahoe_js_data = self._parse_json( + self._search_regex( + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, + 'tahoe js data', default='{}'), + video_id, fatal=False) + video_data = extract_from_jsmods_instances(tahoe_js_data) + + if not video_data: + raise ExtractorError('Cannot parse data') formats = [] for f in video_data: @@ -408,11 +417,10 @@ class FacebookIE(InfoExtractor): video_title = limit_length(video_title, 80) else: video_title = 'Facebook video #%s' % video_id - uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) - if not uploader: - uploader = self._search_regex( - [r'ownerName\s*:\s*"([^"]+)"', r'property="og:title"\s*content="(.*?)"'], - webpage, 'uploader', fatal=False) + uploader = clean_html(get_element_by_id( + 'fbPhotoPageAuthorName', webpage)) or self._search_regex( + r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', + fatal=False) or self._og_search_title(webpage, fatal=False) timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) From 73c938e46055690646ef6b81ed53cf1a0bd976a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jun 2018 01:49:48 +0700 Subject: [PATCH 214/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4e989caf7..f75cb6f11 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version + +Core +* [utils] Improve determine_ext + +Extractors ++ [facebook] Add support for tahoe player videos (#15441, #16554) +* [cbc] Improve extraction (#16583, #16593) +* [openload] Improve ext extraction (#16595) ++ [twitter:card] Add support for another endpoint (#16586) ++ [openload] Add support for oload.win and oload.download (#16592) +* [audimedia] Fix extraction (#15309) ++ [francetv] Add support for sport.francetvinfo.fr (#15645) +* [mlb] Improve extraction (#16587) +- [nhl] Remove old extractors +* [rbmaradio] Check formats availability (#16585) + + version 2018.05.30 Core From 19e42ead9b6dc933fd6cc1adb6b10e2869ecb091 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jun 2018 01:51:31 +0700 Subject: [PATCH 215/488] release 2018.06.02 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 4 +--- youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index b47a450a4..10efa29a4 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.30*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.30** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.02** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.30 +[debug] youtube-dl version 2018.06.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index f75cb6f11..edd190051 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.06.02 Core * [utils] Improve determine_ext diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c2d5401d6..8ce13581b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -553,9 +553,6 @@ - **nfl.com** - **NhkVod** - **nhl.com** - - **nhl.com:news**: NHL news - - **nhl.com:videocenter** - - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** - **nick.de** - **nickelodeon:br** @@ -793,6 +790,7 @@ - **Spiegel** - **Spiegel:Article**: Articles on spiegel.de - **Spiegeltv** + - **sport.francetvinfo.fr** - **Sport5** - **SportBoxEmbed** - **SportDeutschland** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0f15738b2..5ace1b355 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.30' +__version__ = '2018.06.02' From 1ea559c445d6612968b7586355f236cabfd42ef5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 2 Jun 2018 18:07:36 +0100 Subject: [PATCH 216/488] [adn] fix extraction --- youtube_dl/extractor/adn.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index 041c61aff..1eb99c39a 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 +import binascii import json import os +import random from .common import InfoExtractor from ..aes import aes_cbc_decrypt @@ -12,9 +15,12 @@ from ..compat import ( ) from ..utils import ( bytes_to_intlist, + bytes_to_long, ExtractorError, float_or_none, intlist_to_bytes, + long_to_bytes, + pkcs1pad, srt_subtitles_timecode, strip_or_none, urljoin, @@ -35,6 +41,7 @@ class ADNIE(InfoExtractor): } } _BASE_URL = 'http://animedigitalnetwork.fr' + _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537) def _get_subtitles(self, sub_path, video_id): if not sub_path: @@ -42,16 +49,14 @@ class ADNIE(InfoExtractor): enc_subtitles = self._download_webpage( urljoin(self._BASE_URL, sub_path), - video_id, fatal=False, headers={ - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0', - }) + video_id, fatal=False) if not enc_subtitles: return None # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(b'\xc8\x6e\x06\xbc\xbe\xc6\x49\xf5\x88\x0d\xc8\x47\xc4\x27\x0c\x60'), + bytes_to_intlist(binascii.unhexlify(self._K + '9032ad7083106400')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -112,11 +117,24 @@ class ADNIE(InfoExtractor): error = None if not links: links_url = player_config.get('linksurl') or options['videoUrl'] - links_data = self._download_json(urljoin( - self._BASE_URL, links_url), video_id) + token = options['token'] + self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) + message = bytes_to_intlist(json.dumps({ + 'k': self._K, + 'e': 60, + 't': token, + })) + padded_message = intlist_to_bytes(pkcs1pad(message, 128)) + n, e = self._RSA_KEY + encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) + authorization = base64.b64encode(encrypted_message).decode() + links_data = self._download_json( + urljoin(self._BASE_URL, links_url), video_id, headers={ + 'Authorization': 'Bearer ' + authorization, + }) links = links_data.get('links') or {} metas = metas or links_data.get('meta') or {} - sub_path = sub_path or links_data.get('subtitles') + sub_path = (sub_path or links_data.get('subtitles')) + '&token=' + token error = links_data.get('error') title = metas.get('title') or video_info['title'] From 003fe73ccf06192bb94d524fe9c39252ff1b1dd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jun 2018 00:52:22 +0700 Subject: [PATCH 217/488] [safari] Add support for new URL schema (closes #16614) --- youtube_dl/extractor/safari.py | 67 +++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 8a5d48fc2..30e2a38b4 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -74,7 +74,14 @@ class SafariBaseIE(InfoExtractor): class SafariIE(SafariBaseIE): IE_NAME = 'safari' IE_DESC = 'safaribooksonline.com online video' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P[^/]+)/(?P[^/?#&]+)\.html' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?safaribooksonline\.com/ + (?: + library/view/[^/]+/(?P[^/]+)/(?P[^/?\#&]+)\.html| + videos/[^/]+/[^/]+/(?P[^-]+-[^/?\#&]+) + ) + ''' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', @@ -94,22 +101,41 @@ class SafariIE(SafariBaseIE): }, { 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', + 'only_matching': True, }] + _PARTNER_ID = '1926081' + _UICONF_ID = '29375172' + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = '%s/%s' % (mobj.group('course_id'), mobj.group('part')) - webpage = self._download_webpage(url, video_id) - reference_id = self._search_regex( - r'data-reference-id=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'kaltura reference id', group='id') - partner_id = self._search_regex( - r'data-partner-id=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'kaltura widget id', group='id') - ui_id = self._search_regex( - r'data-ui-id=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'kaltura uiconf id', group='id') + reference_id = mobj.group('reference_id') + if reference_id: + video_id = reference_id + partner_id = self._PARTNER_ID + ui_id = self._UICONF_ID + else: + video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part')) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + mobj = re.match(self._VALID_URL, urlh.geturl()) + reference_id = mobj.group('reference_id') + if not reference_id: + reference_id = self._search_regex( + r'data-reference-id=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'kaltura reference id', group='id') + partner_id = self._search_regex( + r'data-partner-id=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'kaltura widget id', default=self._PARTNER_ID, + group='id') + ui_id = self._search_regex( + r'data-ui-id=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'kaltura uiconf id', default=self._UICONF_ID, + group='id') query = { 'wid': '_%s' % partner_id, @@ -159,10 +185,15 @@ class SafariCourseIE(SafariBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)| + (?:www\.)?safaribooksonline\.com/ + (?: + library/view/[^/]+| + api/v1/book| + videos/[^/]+ + )| techbus\.safaribooksonline\.com ) - /(?P[^/]+)/?(?:[#?]|$) + /(?P[^/]+) ''' _TESTS = [{ @@ -179,8 +210,16 @@ class SafariCourseIE(SafariBaseIE): }, { 'url': 'http://techbus.safaribooksonline.com/9780134426365', 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url) + else super(SafariCourseIE, cls).suitable(url)) + def _real_extract(self, url): course_id = self._match_id(url) From 936784b272db3f85f5ff5bdd2d5a71e0397ee7bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jun 2018 02:05:14 +0700 Subject: [PATCH 218/488] [youtube] Extract track and artist --- youtube_dl/extractor/youtube.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 379559825..677907aba 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -510,6 +510,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', 'license': 'Standard YouTube License', 'creator': 'Icona Pop', + 'track': 'I Love It (feat. Charli XCX)', + 'artist': 'Icona Pop', } }, { @@ -528,6 +530,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', + 'track': 'Tunnel Vision`', + 'artist': 'Justin Timberlake', 'age_limit': 18, } }, @@ -1765,6 +1769,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_alt_title = video_creator = None + def extract_meta(field): + return self._html_search_regex( + r']+class="title"[^>]*>\s*%s\s*\s*]*>\s*
  • (.+?)
  • \s*' % field, + video_webpage, field, default=None) + + track = extract_meta('Song') + artist = extract_meta('Artist') + m_episode = re.search( r']+id="watch7-headline"[^>]*>\s*]*>.*?>(?P[^<]+)\s*S(?P\d+)\s*•\s*E(?P\d+)
    ', video_webpage) @@ -2055,9 +2067,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': video_uploader_url, 'upload_date': upload_date, 'license': video_license, - 'creator': video_creator, + 'creator': video_creator or artist, 'title': video_title, - 'alt_title': video_alt_title, + 'alt_title': video_alt_title or track, 'thumbnail': video_thumbnail, 'description': video_description, 'categories': video_categories, @@ -2080,6 +2092,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'series': series, 'season_number': season_number, 'episode_number': episode_number, + 'track': track, + 'artist': artist, } From 7e72694b5e0691adfd90f5d5ecd47647625511e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jun 2018 02:08:38 +0700 Subject: [PATCH 219/488] [youtube] Move metadata extraction after video availability check --- youtube_dl/extractor/youtube.py | 259 ++++++++++++++++---------------- 1 file changed, 128 insertions(+), 131 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 677907aba..b8cea1191 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -530,7 +530,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', - 'track': 'Tunnel Vision`', + 'track': 'Tunnel Vision', 'artist': 'Justin Timberlake', 'age_limit': 18, } @@ -1698,136 +1698,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True) - # Start extracting information - self.report_information_extraction(video_id) - - # uploader - video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) - if video_uploader: - video_uploader = compat_urllib_parse_unquote_plus(video_uploader) - else: - self._downloader.report_warning('unable to extract uploader name') - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - self._downloader.report_warning('unable to extract uploader nickname') - - # thumbnail image - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - elif 'thumbnail_url' not in video_info: - self._downloader.report_warning('unable to extract video thumbnail') - video_thumbnail = None - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], - video_webpage, 'upload date', default=None) - upload_date = unified_strdate(upload_date) - - video_license = self._html_search_regex( - r']+class="title"[^>]*>\s*License\s*\s*]*>\s*
  • (.+?)]+class="title"[^>]*>\s*Music\s*\s* - ]*>\s* -
  • (?P.+?) - by (?P<creator>.+?) - (?: - \(.+?\)| - <a[^>]* - (?: - \bhref=["\']/red[^>]*>| # drop possible - >\s*Listen ad-free with YouTube Red # YouTube Red ad - ) - .*? - )?</li - ''', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) - else: - video_alt_title = video_creator = None - - def extract_meta(field): - return self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, - video_webpage, field, default=None) - - track = extract_meta('Song') - artist = extract_meta('Artist') - - m_episode = re.search( - r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', - video_webpage) - if m_episode: - series = m_episode.group('series') - season_number = int(m_episode.group('season')) - episode_number = int(m_episode.group('episode')) - else: - series = season_number = episode_number = None - - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - if m_cat_container: - category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - video_categories = None if category is None else [category] - else: - video_categories = None - - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - - def _extract_count(count_name): - return str_to_int(self._search_regex( - r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' - % re.escape(count_name), - video_webpage, count_name, default=None)) - - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') - - # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) - automatic_captions = self.extract_automatic_captions(video_id, video_webpage) - - video_duration = try_get( - video_info, lambda x: int_or_none(x['length_seconds'][0])) - if not video_duration: - video_duration = parse_duration(self._html_search_meta( - 'duration', video_webpage, 'video duration')) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - video_annotations = self._extract_annotations(video_id) - - chapters = self._extract_chapters(description_original, video_duration) - def _extract_filesize(media_url): return int_or_none(self._search_regex( r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) @@ -2002,6 +1872,133 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError(error_message, expected=True) raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + # uploader + video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) + if video_uploader: + video_uploader = compat_urllib_parse_unquote_plus(video_uploader) + else: + self._downloader.report_warning('unable to extract uploader name') + + # uploader_id + video_uploader_id = None + video_uploader_url = None + mobj = re.search( + r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', + video_webpage) + if mobj is not None: + video_uploader_id = mobj.group('uploader_id') + video_uploader_url = mobj.group('uploader_url') + else: + self._downloader.report_warning('unable to extract uploader nickname') + + # thumbnail image + # We try first to get a high quality image: + m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', + video_webpage, re.DOTALL) + if m_thumb is not None: + video_thumbnail = m_thumb.group(1) + elif 'thumbnail_url' not in video_info: + self._downloader.report_warning('unable to extract video thumbnail') + video_thumbnail = None + else: # don't panic if we can't find it + video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) + + # upload date + upload_date = self._html_search_meta( + 'datePublished', video_webpage, 'upload date', default=None) + if not upload_date: + upload_date = self._search_regex( + [r'(?s)id="eow-date.*?>(.*?)</span>', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], + video_webpage, 'upload date', default=None) + upload_date = unified_strdate(upload_date) + + video_license = self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', + video_webpage, 'license', default=None) + + m_music = re.search( + r'''(?x) + <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* + <ul[^>]*>\s* + <li>(?P<title>.+?) + by (?P<creator>.+?) + (?: + \(.+?\)| + <a[^>]* + (?: + \bhref=["\']/red[^>]*>| # drop possible + >\s*Listen ad-free with YouTube Red # YouTube Red ad + ) + .*? + )?</li + ''', + video_webpage) + if m_music: + video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) + video_creator = clean_html(m_music.group('creator')) + else: + video_alt_title = video_creator = None + + def extract_meta(field): + return self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, + video_webpage, field, default=None) + + track = extract_meta('Song') + artist = extract_meta('Artist') + + m_episode = re.search( + r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', + video_webpage) + if m_episode: + series = m_episode.group('series') + season_number = int(m_episode.group('season')) + episode_number = int(m_episode.group('episode')) + else: + series = season_number = episode_number = None + + m_cat_container = self._search_regex( + r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', + video_webpage, 'categories', default=None) + if m_cat_container: + category = self._html_search_regex( + r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', + default=None) + video_categories = None if category is None else [category] + else: + video_categories = None + + video_tags = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + + def _extract_count(count_name): + return str_to_int(self._search_regex( + r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' + % re.escape(count_name), + video_webpage, count_name, default=None)) + + like_count = _extract_count('like') + dislike_count = _extract_count('dislike') + + # subtitles + video_subtitles = self.extract_subtitles(video_id, video_webpage) + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) + + video_duration = try_get( + video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = parse_duration(self._html_search_meta( + 'duration', video_webpage, 'video duration')) + + # annotations + video_annotations = None + if self._downloader.params.get('writeannotations', False): + video_annotations = self._extract_annotations(video_id) + + chapters = self._extract_chapters(description_original, video_duration) + # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): dash_mpd_fatal = True From eb6793ba970351ecc8f8a579ff4e4665fb649f9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jun 2018 02:23:45 +0700 Subject: [PATCH 220/488] [youtube] Update tests --- youtube_dl/extractor/youtube.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b8cea1191..89c8b7f8d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -601,7 +601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'IB3lcPjvWLA', 'ext': 'm4a', 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', - 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d', + 'description': 'md5:1900ed86ee514927b9e00fbead6969a5', 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', @@ -642,7 +642,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 219, 'upload_date': '20100909', - 'uploader': 'The Amazing Atheist', + 'uploader': 'TJ Kirk', 'uploader_id': 'TheAmazingAtheist', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', 'license': 'Standard YouTube License', @@ -672,10 +672,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', 'info_dict': { 'id': '6kLq3WMV1nU', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', - 'duration': 247, + 'duration': 246, 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', @@ -737,7 +737,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'AllenMeow', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫艾倫', + 'uploader': '孫ᄋᄅ', 'license': 'Standard YouTube License', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, @@ -764,7 +764,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', 'info_dict': { 'id': 'FIl7x6_3R5Y', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'md5:7b81415841e02ecd4313668cde88737a', 'description': 'md5:116377fd2963b81ec4ce64b542173306', 'duration': 220, @@ -773,8 +773,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', 'license': 'Standard YouTube License', - 'formats': 'mincount:32', + 'formats': 'mincount:31', }, + 'skip': 'not actual anymore', }, # DASH manifest with segment_list { @@ -889,7 +890,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk', + 'alt_title': 'Dark Walk - Position Music', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'duration': 133, 'upload_date': '20151119', @@ -897,7 +898,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', 'license': 'Standard YouTube License', - 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', + 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', + 'track': 'Dark Walk - Position Music', + 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', }, 'params': { 'skip_download': True, @@ -954,7 +957,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:dda0d780d5a6e120758d1711d062a867', 'duration': 4060, 'upload_date': '20151119', - 'uploader': 'Bernie 2016', + 'uploader': 'Bernie Sanders', 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', 'license': 'Creative Commons Attribution license (reuse allowed)', @@ -989,6 +992,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is not available.', }, { # YouTube Red video with episode data @@ -997,7 +1001,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'iqKdEhx-dD4', 'ext': 'mp4', 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:8013b7ddea787342608f63a13ddc9492', + 'description': 'md5:25b78d2f64ae81719f5c96319889b736', 'duration': 2085, 'upload_date': '20170118', 'uploader': 'Vsauce', @@ -1030,7 +1034,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', 'license': 'Standard YouTube License', - 'view_count': int, }, 'params': { 'skip_download': True, From 6d155707e67dc9e3bc4e118dc7d6a06bf8af471f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jun 2018 04:07:59 +0700 Subject: [PATCH 221/488] [bbc] Add support for bbcthree (closes #16612) --- youtube_dl/extractor/bbc.py | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 8b20c03d6..30a63a24e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -12,6 +12,7 @@ from ..utils import ( float_or_none, get_element_by_class, int_or_none, + js_to_json, parse_duration, parse_iso8601, try_get, @@ -772,6 +773,17 @@ class BBCIE(BBCCoUkIE): # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', + 'info_dict': { + 'id': 'p06556y7', + 'ext': 'mp4', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + }, + 'params': { + 'skip_download': True, + } }] @classmethod @@ -994,6 +1006,36 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + bbc3_config = self._parse_json( + self._search_regex( + r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, + 'bbcthree config', default='{}'), + playlist_id, transform_source=js_to_json, fatal=False) + if bbc3_config: + bbc3_playlist = try_get( + bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + dict) + if bbc3_playlist: + playlist_title = bbc3_playlist.get('title') or playlist_title + thumbnail = bbc3_playlist.get('holdingImageURL') + entries = [] + for bbc3_item in bbc3_playlist['items']: + programme_id = bbc3_item.get('versionID') + if not programme_id: + continue + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': playlist_title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), From 0a10f50e2f1fb8ed80a3707970ba44593cfff8eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jun 2018 04:30:33 +0700 Subject: [PATCH 222/488] [chaturbate] Use geo verification headers --- youtube_dl/extractor/chaturbate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index e3eba4be9..e2b828d8a 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -31,7 +31,8 @@ class ChaturbateIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + url, video_id, headers=self.geo_verification_headers()) m3u8_urls = [] From b6b2ccb72fb7da7563078d4bf047d1622ba89553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jun 2018 15:53:20 +0700 Subject: [PATCH 223/488] [twitter:card] Extract guest token (closes #16609) --- youtube_dl/extractor/twitter.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 4a77e792e..f3fccbf1d 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -229,11 +229,22 @@ class TwitterCardIE(TwitterBaseIE): break if not formats: + headers = { + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', + 'Referer': url, + } + ct0 = self._get_cookies(url).get('ct0') + if ct0: + headers['csrf_token'] = ct0.value + guest_token = self._download_json( + 'https://api.twitter.com/1.1/guest/activate.json', video_id, + 'Downloading guest token', data=b'', + headers=headers)['guest_token'] + headers['x-guest-token'] = guest_token + self._set_cookie('api.twitter.com', 'gt', guest_token) config = self._download_json( 'https://api.twitter.com/1.1/videos/tweet/config/%s.json' % video_id, - video_id, headers={ - 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE', - }) + video_id, headers=headers) track = config['track'] vmap_url = track.get('vmapUrl') if vmap_url: From 77053237c56341806c759f809a84975ede8141d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jun 2018 15:58:12 +0700 Subject: [PATCH 224/488] [twitter:card] Generalize base API URL --- youtube_dl/extractor/twitter.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index f3fccbf1d..de41065d6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -108,6 +108,8 @@ class TwitterCardIE(TwitterBaseIE): }, ] + _API_BASE = 'https://api.twitter.com/1.1' + def _parse_media_info(self, media_info, video_id): formats = [] for media_variant in media_info.get('variants', []): @@ -149,7 +151,7 @@ class TwitterCardIE(TwitterBaseIE): main_script, 'bearer token') # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id api_data = self._download_json( - 'https://api.twitter.com/1.1/statuses/show/%s.json' % video_id, + '%s/statuses/show/%s.json' % (self._API_BASE, video_id), video_id, 'Downloading API data', headers={ 'Authorization': 'Bearer ' + bearer_token, @@ -237,13 +239,13 @@ class TwitterCardIE(TwitterBaseIE): if ct0: headers['csrf_token'] = ct0.value guest_token = self._download_json( - 'https://api.twitter.com/1.1/guest/activate.json', video_id, + '%s/guest/activate.json' % self._API_BASE, video_id, 'Downloading guest token', data=b'', headers=headers)['guest_token'] headers['x-guest-token'] = guest_token self._set_cookie('api.twitter.com', 'gt', guest_token) config = self._download_json( - 'https://api.twitter.com/1.1/videos/tweet/config/%s.json' % video_id, + '%s/videos/tweet/config/%s.json' % (self._API_BASE, video_id), video_id, headers=headers) track = config['track'] vmap_url = track.get('vmapUrl') From c3023e9f2e9e84942df0789b8bc799b6ea51d690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jun 2018 17:09:20 +0700 Subject: [PATCH 225/488] [camtube] Add extractor --- youtube_dl/extractor/camtube.py | 69 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 70 insertions(+) create mode 100644 youtube_dl/extractor/camtube.py diff --git a/youtube_dl/extractor/camtube.py b/youtube_dl/extractor/camtube.py new file mode 100644 index 000000000..c7d40f849 --- /dev/null +++ b/youtube_dl/extractor/camtube.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class CamTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female', + 'info_dict': { + 'id': '42ad3956-dd5b-445a-8313-803ea6079fac', + 'display_id': 'minafay-030618-1136-chaturbate-female', + 'ext': 'mp4', + 'title': 'minafay-030618-1136-chaturbate-female', + 'duration': 1274, + 'timestamp': 1528018608, + 'upload_date': '20180603', + }, + 'params': { + 'skip_download': True, + }, + }] + + _API_BASE = 'https://api.camtube.co' + + def _real_extract(self, url): + display_id = self._match_id(url) + + token = self._download_json( + '%s/rpc/session/new' % self._API_BASE, display_id, + 'Downloading session token')['token'] + + self._set_cookie('api.camtube.co', 'session', token) + + video = self._download_json( + '%s/recordings/%s' % (self._API_BASE, display_id), display_id, + headers={'Referer': url}) + + video_id = video['uuid'] + timestamp = unified_timestamp(video.get('createdAt')) + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('viewCount')) + like_count = int_or_none(video.get('likeCount')) + creator = video.get('stageName') + + formats = [{ + 'url': '%s/recordings/%s/manifest.m3u8' + % (self._API_BASE, video_id), + 'format_id': 'hls', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': display_id, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'creator': creator, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b05afd101..6df829054 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -147,6 +147,7 @@ from .camdemy import ( CamdemyFolderIE ) from .cammodels import CamModelsIE +from .camtube import CamTubeIE from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE From c6c478f40d4afa3fb97bf25eeff902ce95a5aa88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 4 Jun 2018 02:16:33 +0700 Subject: [PATCH 226/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog b/ChangeLog index edd190051..db6f8a3ad 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +version <unreleased> + +Extractors ++ [camtube] Add support for camtube.co ++ [twitter:card] Extract guest token (#16609) ++ [chaturbate] Use geo verification headers ++ [bbc] Add support for bbcthree (#16612) +* [youtube] Move metadata extraction after video availability check ++ [youtube] Extract track and artist ++ [safari] Add support for new URL schema (#16614) +* [adn] Fix extraction + + version 2018.06.02 Core From f7560859a3e25ccaa74123428d42f821299a2bed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 4 Jun 2018 02:33:54 +0700 Subject: [PATCH 227/488] [devscripts/update-copyright] Update copyright year --- devscripts/gh-pages/update-copyright.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devscripts/gh-pages/update-copyright.py b/devscripts/gh-pages/update-copyright.py index e6c3abc8d..61487f925 100755 --- a/devscripts/gh-pages/update-copyright.py +++ b/devscripts/gh-pages/update-copyright.py @@ -13,7 +13,7 @@ year = str(datetime.datetime.now().year) for fn in glob.glob('*.html*'): with io.open(fn, encoding='utf-8') as f: content = f.read() - newc = re.sub(r'(?P<copyright>Copyright © 2006-)(?P<year>[0-9]{4})', 'Copyright © 2006-' + year, content) + newc = re.sub(r'(?P<copyright>Copyright © 2011-)(?P<year>[0-9]{4})', 'Copyright © 2011-' + year, content) if content != newc: tmpFn = fn + '.part' with io.open(tmpFn, 'wt', encoding='utf-8') as outf: From 94418c8eb3e458a16fa7301c0987dec5f04faa6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 4 Jun 2018 02:41:53 +0700 Subject: [PATCH 228/488] release 2018.06.04 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 10efa29a4..aa7686efd 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.02** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.04*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.04** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.02 +[debug] youtube-dl version 2018.06.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index db6f8a3ad..5375e03fc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.04 Extractors + [camtube] Add support for camtube.co diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8ce13581b..e1a9f2236 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -129,6 +129,7 @@ - **Camdemy** - **CamdemyFolder** - **CamModels** + - **CamTube** - **CamWithHer** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5ace1b355..ab3419f0c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.02' +__version__ = '2018.06.04' From 2e190c2ad9a985940fa0ca2cb7a09398319dd2c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 4 Jun 2018 23:51:25 +0700 Subject: [PATCH 229/488] [rbmaradio] Add support for 192k format (closes #16631) --- youtube_dl/extractor/rbmaradio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index 9c4d72bbd..ae7413fb5 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -53,7 +53,7 @@ class RBMARadioIE(InfoExtractor): 'format_id': compat_str(abr), 'abr': abr, 'vcodec': 'none', - } for abr in (96, 128, 256)] + } for abr in (96, 128, 192, 256)] self._check_formats(formats, episode_id) description = clean_html(episode.get('longTeaser')) From d7be7053082055a001b788453d66131a62692b55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Jun 2018 00:17:26 +0700 Subject: [PATCH 230/488] [pbs] Add another cove id pattern (closes #15373) --- youtube_dl/extractor/pbs.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index a28ee17ca..8d6f2dd3d 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -360,6 +360,21 @@ class PBSIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/', + 'info_dict': { + 'id': '2365936247', + 'ext': 'mp4', + 'title': 'Antiques Roadshow - Indianapolis, Hour 2', + 'description': 'md5:524b32249db55663e7231b6b8d1671a2', + 'duration': 3180, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -422,6 +437,7 @@ class PBSIE(InfoExtractor): r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", + r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ ] media_id = self._search_regex( From 06ea7bdd99c16b718c977a652f0ef66341ac2b6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Jun 2018 02:55:54 +0700 Subject: [PATCH 231/488] [nexx] Add support for free cdn (closes #16538) --- youtube_dl/extractor/nexx.py | 209 ++++++++++++++++++++++++----------- 1 file changed, 144 insertions(+), 65 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 5e46a75c0..6f40d7f89 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -77,6 +77,21 @@ class NexxIE(InfoExtractor): 'timestamp': 1518614955, 'upload_date': '20180214', }, + }, { + # free cdn from http://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html + 'url': 'nexx:747:1533779', + 'md5': '6bf6883912b82b7069fb86c2297e9893', + 'info_dict': { + 'id': '1533779', + 'ext': 'mp4', + 'title': 'Aufregung um ausgebrochene Raubtiere', + 'alt_title': 'Eifel-Zoo', + 'description': 'md5:f21375c91c74ad741dcb164c427999d2', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 111, + 'timestamp': 1527874460, + 'upload_date': '20180601', + }, }, { 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', 'only_matching': True, @@ -141,6 +156,127 @@ class NexxIE(InfoExtractor): self._handle_error(result) return result['result'] + def _extract_free_formats(self, video, video_id): + stream_data = video['streamdata'] + cdn = stream_data['cdnType'] + assert cdn == 'free' + + hash = video['general']['hash'] + + ps = compat_str(stream_data['originalDomain']) + if stream_data['applyFolderHierarchy'] == 1: + s = ('%04d' % int(video_id))[::-1] + ps += '/%s/%s' % (s[0:2], s[2:4]) + ps += '/%s/%s_' % (video_id, hash) + + formats = [{ + 'url': 'http://%s%s2500_var.mp4' % (stream_data['cdnPathHTTP'], ps), + 'format_id': '%s-http' % cdn, + }] + + def make_url(root, protocol): + t = 'http://' + root + ps + fd = stream_data['azureFileDistribution'].split(',') + cdn_provider = stream_data['cdnProvider'] + + def p0(p): + return '_%s' % int(p[0]) if stream_data['applyAzureStructure'] == 1 else '' + + if cdn_provider == 'ak': + t += ',' + for i in fd: + p = i.split(':') + t += p[1] + p0(p) + ',' + t += '.mp4.csmil/master.m3u8' + elif cdn_provider == 'ce': + k = t.split('/') + h = k.pop() + t = '/'.join(k) + t += '/asset.ism/manifest.' + ('m3u8' if protocol == 'hls' else 'mpd') + '?dcp_ver=aos4&videostream=' + for i in fd: + p = i.split(':') + a = '%s%s%s.mp4:%s' % (h, p[1], p0(p), int(p[0]) * 1000) + t += a + ',' + t = t[:-1] + '&audiostream=' + a.split(':')[0] + return t + + formats.extend(self._extract_mpd_formats( + make_url(stream_data['cdnPathDASH'], 'dash'), video_id, + mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_m3u8_formats( + make_url(stream_data['cdnPathHLS'], 'hls'), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False)) + + return formats + + def _extract_azure_formats(self, video, video_id): + stream_data = video['streamdata'] + cdn = stream_data['cdnType'] + assert cdn == 'azure' + + azure_locator = stream_data['azureLocator'] + + def get_cdn_shield_base(shield_type='', static=False): + for secure in ('', 's'): + cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) + if cdn_shield: + return 'http%s://%s' % (secure, cdn_shield) + else: + if 'fb' in stream_data['azureAccount']: + prefix = 'df' if static else 'f' + else: + prefix = 'd' if static else 'p' + account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) + return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) + + language = video['general'].get('language_raw') or '' + + azure_stream_base = get_cdn_shield_base() + is_ml = ',' in language + azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( + azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' + + protection_token = try_get( + video, lambda x: x['protectiondata']['token'], compat_str) + if protection_token: + azure_manifest_url += '?hdnts=%s' % protection_token + + formats = self._extract_m3u8_formats( + azure_manifest_url % '(format=m3u8-aapl)', + video_id, 'mp4', 'm3u8_native', + m3u8_id='%s-hls' % cdn, fatal=False) + formats.extend(self._extract_mpd_formats( + azure_manifest_url % '(format=mpd-time-csf)', + video_id, mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_ism_formats( + azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) + + azure_progressive_base = get_cdn_shield_base('Prog', True) + azure_file_distribution = stream_data.get('azureFileDistribution') + if azure_file_distribution: + fds = azure_file_distribution.split(',') + if fds: + for fd in fds: + ss = fd.split(':') + if len(ss) == 2: + tbr = int_or_none(ss[0]) + if tbr: + f = { + 'url': '%s%s/%s_src_%s_%d.mp4' % ( + azure_progressive_base, azure_locator, video_id, ss[1], tbr), + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + width_height = ss[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + + return formats + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) domain_id = mobj.group('domain_id') or mobj.group('domain_id_s') @@ -220,72 +356,15 @@ class NexxIE(InfoExtractor): general = video['general'] title = general['title'] - stream_data = video['streamdata'] - language = general.get('language_raw') or '' + cdn = video['streamdata']['cdnType'] - # TODO: reverse more cdns - - cdn = stream_data['cdnType'] - assert cdn == 'azure' - - azure_locator = stream_data['azureLocator'] - - def get_cdn_shield_base(shield_type='', static=False): - for secure in ('', 's'): - cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) - if cdn_shield: - return 'http%s://%s' % (secure, cdn_shield) - else: - if 'fb' in stream_data['azureAccount']: - prefix = 'df' if static else 'f' - else: - prefix = 'd' if static else 'p' - account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) - return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) - - azure_stream_base = get_cdn_shield_base() - is_ml = ',' in language - azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( - azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' - - protection_token = try_get( - video, lambda x: x['protectiondata']['token'], compat_str) - if protection_token: - azure_manifest_url += '?hdnts=%s' % protection_token - - formats = self._extract_m3u8_formats( - azure_manifest_url % '(format=m3u8-aapl)', - video_id, 'mp4', 'm3u8_native', - m3u8_id='%s-hls' % cdn, fatal=False) - formats.extend(self._extract_mpd_formats( - azure_manifest_url % '(format=mpd-time-csf)', - video_id, mpd_id='%s-dash' % cdn, fatal=False)) - formats.extend(self._extract_ism_formats( - azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) - - azure_progressive_base = get_cdn_shield_base('Prog', True) - azure_file_distribution = stream_data.get('azureFileDistribution') - if azure_file_distribution: - fds = azure_file_distribution.split(',') - if fds: - for fd in fds: - ss = fd.split(':') - if len(ss) == 2: - tbr = int_or_none(ss[0]) - if tbr: - f = { - 'url': '%s%s/%s_src_%s_%d.mp4' % ( - azure_progressive_base, azure_locator, video_id, ss[1], tbr), - 'format_id': '%s-http-%d' % (cdn, tbr), - 'tbr': tbr, - } - width_height = ss[1].split('x') - if len(width_height) == 2: - f.update({ - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - }) - formats.append(f) + if cdn == 'azure': + formats = self._extract_azure_formats(video, video_id) + elif cdn == 'free': + formats = self._extract_free_formats(video, video_id) + else: + # TODO: reverse more cdns + assert False self._sort_formats(formats) From 2e6975306a51d3ee7dae71bc93d57951487ea6f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Jun 2018 02:59:25 +0700 Subject: [PATCH 232/488] [nexx] Update tests --- youtube_dl/extractor/nexx.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 6f40d7f89..40946d26b 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -29,14 +29,13 @@ class NexxIE(InfoExtractor): _TESTS = [{ # movie 'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907', - 'md5': '828cea195be04e66057b846288295ba1', + 'md5': '31899fd683de49ad46f4ee67e53e83fe', 'info_dict': { 'id': '128907', 'ext': 'mp4', 'title': 'Stiftung Warentest', 'alt_title': 'Wie ein Test abläuft', 'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2', - 'release_year': 2013, 'creator': 'SPIEGEL TV', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2509, @@ -62,6 +61,7 @@ class NexxIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { # does not work via arc 'url': 'nexx:741:1269984', @@ -71,7 +71,6 @@ class NexxIE(InfoExtractor): 'ext': 'mp4', 'title': '1 TAG ohne KLO... wortwörtlich! 😑', 'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑', - 'description': 'md5:4604539793c49eda9443ab5c5b1d612f', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 607, 'timestamp': 1518614955, From 9afd74d70558d7db9e2e13d2afa84746e61f193c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Jun 2018 01:02:46 +0100 Subject: [PATCH 233/488] [nexx] extract free cdn http formats --- youtube_dl/extractor/nexx.py | 76 +++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 40946d26b..82d526c22 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -168,42 +168,54 @@ class NexxIE(InfoExtractor): ps += '/%s/%s' % (s[0:2], s[2:4]) ps += '/%s/%s_' % (video_id, hash) - formats = [{ - 'url': 'http://%s%s2500_var.mp4' % (stream_data['cdnPathHTTP'], ps), - 'format_id': '%s-http' % cdn, - }] + t = 'http://%s' + ps + fd = stream_data['azureFileDistribution'].split(',') + cdn_provider = stream_data['cdnProvider'] - def make_url(root, protocol): - t = 'http://' + root + ps - fd = stream_data['azureFileDistribution'].split(',') - cdn_provider = stream_data['cdnProvider'] + def p0(p): + return '_%s' % p if stream_data['applyAzureStructure'] == 1 else '' - def p0(p): - return '_%s' % int(p[0]) if stream_data['applyAzureStructure'] == 1 else '' + formats = [] + if cdn_provider == 'ak': + t += ',' + for i in fd: + p = i.split(':') + t += p[1] + p0(int(p[0])) + ',' + t += '.mp4.csmil/master.%s' + elif cdn_provider == 'ce': + k = t.split('/') + h = k.pop() + http_base = t = '/'.join(k) + http_base = http_base % stream_data['cdnPathHTTP'] + t += '/asset.ism/manifest.%s?dcp_ver=aos4&videostream=' + for i in fd: + p = i.split(':') + tbr = int(p[0]) + filename = '%s%s%s.mp4' % (h, p[1], p0(tbr)) + f = { + 'url': http_base + '/' + filename, + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + width_height = p[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + a = filename + ':%s' % (tbr * 1000) + t += a + ',' + t = t[:-1] + '&audiostream=' + a.split(':')[0] + else: + assert False - if cdn_provider == 'ak': - t += ',' - for i in fd: - p = i.split(':') - t += p[1] + p0(p) + ',' - t += '.mp4.csmil/master.m3u8' - elif cdn_provider == 'ce': - k = t.split('/') - h = k.pop() - t = '/'.join(k) - t += '/asset.ism/manifest.' + ('m3u8' if protocol == 'hls' else 'mpd') + '?dcp_ver=aos4&videostream=' - for i in fd: - p = i.split(':') - a = '%s%s%s.mp4:%s' % (h, p[1], p0(p), int(p[0]) * 1000) - t += a + ',' - t = t[:-1] + '&audiostream=' + a.split(':')[0] - return t - - formats.extend(self._extract_mpd_formats( - make_url(stream_data['cdnPathDASH'], 'dash'), video_id, - mpd_id='%s-dash' % cdn, fatal=False)) + if cdn_provider == 'ce': + formats.extend(self._extract_mpd_formats( + t % (stream_data['cdnPathDASH'], 'mpd'), video_id, + mpd_id='%s-dash' % cdn, fatal=False)) formats.extend(self._extract_m3u8_formats( - make_url(stream_data['cdnPathHLS'], 'hls'), video_id, 'mp4', + t % (stream_data['cdnPathHLS'], 'm3u8'), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False)) return formats From 6ae36035d9ba2eae88070eb647f45cc4ceeb0998 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 6 Jun 2018 00:41:08 +0100 Subject: [PATCH 234/488] [tv4] fix format extraction(closes #16650) --- youtube_dl/extractor/tv4.py | 62 +++++++++++++------------------------ 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index cfcce020a..51923e44a 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -1,13 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, parse_iso8601, - try_get, - determine_ext, ) @@ -78,42 +77,25 @@ class TV4IE(InfoExtractor): title = info['title'] - subtitles = {} - formats = [] - # http formats are linked with unresolvable host - for kind in ('hls3', ''): - data = self._download_json( - 'https://prima.tv4play.se/api/web/asset/%s/play.json' % video_id, - video_id, 'Downloading sources JSON', query={ - 'protocol': kind, - 'videoFormat': 'MP4+WEBVTT', - }) - items = try_get(data, lambda x: x['playback']['items']['item']) - if not items: - continue - if isinstance(items, dict): - items = [items] - for item in items: - manifest_url = item.get('url') - if not isinstance(manifest_url, compat_str): - continue - ext = determine_ext(manifest_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=kind, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_akamai_formats( - manifest_url, video_id, { - 'hls': 'tv4play-i.akamaihd.net', - })) - elif ext == 'webvtt': - subtitles = self._merge_subtitles( - subtitles, { - 'sv': [{ - 'url': manifest_url, - 'ext': 'vtt', - }]}) + manifest_url = self._download_json( + 'https://playback-api.b17g.net/media/' + video_id, + video_id, query={ + 'service': 'tv4', + 'device': 'browser', + 'protocol': 'hls', + })['playbackItem']['manifestUrl'] + formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + manifest_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_f4m_formats( + manifest_url.replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_ism_formats( + re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url), + video_id, ism_id='mss', fatal=False)) if not formats and info.get('is_geo_restricted'): self.raise_geo_restricted(countries=self._GEO_COUNTRIES) @@ -124,7 +106,7 @@ class TV4IE(InfoExtractor): 'id': video_id, 'title': title, 'formats': formats, - 'subtitles': subtitles, + # 'subtitles': subtitles, 'description': info.get('description'), 'timestamp': parse_iso8601(info.get('broadcast_date_time')), 'duration': int_or_none(info.get('duration')), From ff2e4862210d69f2cee509002a9d96ba3a0877d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 9 Jun 2018 02:53:04 +0700 Subject: [PATCH 235/488] [inc] Add support for another embed schema (closes #16666) --- youtube_dl/extractor/inc.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/inc.py b/youtube_dl/extractor/inc.py index 241ec83c4..8dee143ca 100644 --- a/youtube_dl/extractor/inc.py +++ b/youtube_dl/extractor/inc.py @@ -21,6 +21,21 @@ class IncIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # div with id=kaltura_player_1_kqs38cgm + 'url': 'https://www.inc.com/oscar-raymundo/richard-branson-young-entrepeneurs.html', + 'info_dict': { + 'id': '1_kqs38cgm', + 'ext': 'mp4', + 'title': 'Branson: "In the end, you have to say, Screw it. Just do it."', + 'description': 'md5:21b832d034f9af5191ca5959da5e9cb6', + 'timestamp': 1364403232, + 'upload_date': '20130327', + 'uploader_id': 'incdigital@inc.com', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.inc.com/video/david-whitford/founders-forum-tripadvisor-steve-kaufer-most-enjoyable-moment-for-entrepreneur.html', 'only_matching': True, @@ -31,9 +46,12 @@ class IncIE(InfoExtractor): webpage = self._download_webpage(url, display_id) partner_id = self._search_regex( - r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, 'partner id') + r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, + 'partner id', default='1034971') - kaltura_id = self._parse_json(self._search_regex( + kaltura_id = self._search_regex( + r'id=(["\'])kaltura_player_(?P<id>.+?)\1', webpage, 'kaltura id', + default=None, group='id') or self._parse_json(self._search_regex( r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), display_id)['vid_kaltura_id'] From 9d581efe053a22042ea7530a8e92097d0a91eea4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Jun 2018 00:26:16 +0700 Subject: [PATCH 236/488] [npo] Extend _VALID_URL (closes #16682) --- youtube_dl/extractor/npo.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index ff2153387..cb8319f0d 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -36,8 +36,8 @@ class NPOIE(NPOBaseIE): https?:// (?:www\.)? (?: - npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}| - ntr\.nl/(?:[^/]+/){2,}| + npo\.nl/(?:[^/]+/)*| + (?:ntr|npostart)\.nl/(?:[^/]+/){2,}| omroepwnl\.nl/video/fragment/[^/]+__| (?:zapp|npo3)\.nl/(?:[^/]+/){2,} ) @@ -160,8 +160,20 @@ class NPOIE(NPOBaseIE): }, { 'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927', 'only_matching': True, + }, { + 'url': 'https://www.npostart.nl/broodje-gezond-ei/28-05-2018/KN_1698996', + 'only_matching': True, + }, { + 'url': 'https://npo.nl/KN_1698996', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if any(ie.suitable(url) + for ie in (NPOLiveIE, NPORadioIE, NPORadioFragmentIE)) + else super(NPOIE, cls).suitable(url)) + def _real_extract(self, url): video_id = self._match_id(url) return self._get_info(video_id) @@ -389,7 +401,7 @@ class NPOLiveIE(NPOBaseIE): class NPORadioIE(InfoExtractor): IE_NAME = 'npo.nl:radio' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.npo.nl/radio/radio-1', @@ -404,6 +416,10 @@ class NPORadioIE(InfoExtractor): } } + @classmethod + def suitable(cls, url): + return False if NPORadioFragmentIE.suitable(url) else super(NPORadioIE, cls).suitable(url) + @staticmethod def _html_get_attribute_regex(attribute): return r'{0}\s*=\s*\'([^\']+)\''.format(attribute) From cc37cc3f9935dfcdd3c3b73ee1419c1f8592e056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Jun 2018 01:55:16 +0700 Subject: [PATCH 237/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ChangeLog b/ChangeLog index 5375e03fc..a1db9df4e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +version <unreleased> + +Extractors +* [npo] Extend URL regular expression and add support for npostart.nl (#16682) ++ [inc] Add support for another embed schema (#16666) +* [tv4] Fix format extraction (#16650) ++ [nexx] Add support for free cdn (#16538) ++ [pbs] Add another cove id pattern (#15373) ++ [rbmaradio] Add support for 192k format (#16631) + + version 2018.06.04 Extractors From e8c6afc16877a6b975895bf236c8cdaff01733cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Jun 2018 01:57:30 +0700 Subject: [PATCH 238/488] release 2018.06.11 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index aa7686efd..d5be8003b 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.04*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.04** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.11*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.11** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.04 +[debug] youtube-dl version 2018.06.11 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index a1db9df4e..b808ad6ba 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.11 Extractors * [npo] Extend URL regular expression and add support for npostart.nl (#16682) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ab3419f0c..e72f42cf2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.04' +__version__ = '2018.06.11' From d253df2f65b15d1e3bb5e9703b05d98532337c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Jun 2018 02:40:17 +0700 Subject: [PATCH 239/488] [wimp] Fix Youtube embeds extraction --- youtube_dl/extractor/wimp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index c022fb33e..3dab9145b 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -36,7 +36,8 @@ class WimpIE(InfoExtractor): webpage = self._download_webpage(url, video_id) youtube_id = self._search_regex( - r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + (r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + r'data-id=["\']([0-9A-Za-z_-]{11})'), webpage, 'video URL', default=None) if youtube_id: return { From 93cffb1444131375e822b0c01e23dc4819911419 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Jun 2018 03:08:36 +0700 Subject: [PATCH 240/488] [nrk] Update API hosts and try all previously known ones (closes #16690) --- youtube_dl/extractor/nrk.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 3b4f51f61..7157e2390 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -16,12 +16,22 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] + _api_host = None + def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'http://%s/mediaelement/%s' % (self._API_HOST, video_id), - video_id, 'Downloading mediaelement JSON') + api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS + + for api_host in api_hosts: + data = self._download_json( + 'http://%s/mediaelement/%s' % (api_host, video_id), + video_id, 'Downloading mediaelement JSON', + fatal=api_host == api_hosts[-1]) + if not data: + continue + self._api_host = api_host + break title = data.get('fullTitle') or data.get('mainTitle') or data['title'] video_id = data.get('id') or video_id @@ -191,7 +201,7 @@ class NRKIE(NRKBaseIE): ) (?P<id>[^?#&]+) ''' - _API_HOST = 'v8-psapi.nrk.no' + _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', @@ -237,8 +247,7 @@ class NRKTVIE(NRKBaseIE): (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P<part_id>\d+))? ''' % _EPISODE_RE - _API_HOST = 'psapi-we.nrk.no' - + _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': '4e9ca6629f09e588ed240fb11619922a', From b2df66aecab9faea206cb72e715dfa1394a6d182 Mon Sep 17 00:00:00 2001 From: Thomas van der Berg <ik@thomasvanderberg.nl> Date: Wed, 31 Jan 2018 23:00:30 +0100 Subject: [PATCH 241/488] [tvnet] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/vtv.py | 91 ++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 youtube_dl/extractor/vtv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6df829054..e6d1fe70e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1306,6 +1306,7 @@ from .vrv import ( VRVSeriesIE, ) from .vshare import VShareIE +from .vtv import VTVIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/youtube_dl/extractor/vtv.py b/youtube_dl/extractor/vtv.py new file mode 100644 index 000000000..a9683dd85 --- /dev/null +++ b/youtube_dl/extractor/vtv.py @@ -0,0 +1,91 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +import re + +from ..utils import extract_attributes + +class VTVIE(InfoExtractor): + _VALID_URL = r'https?://(au|ca|cz|de|jp|kr|tw|us|vn)\.tvnet\.gov\.vn/[^/]*/(?P<id>[0-9]+)/?' + _TESTS = [{ + # Livestream. Channel: VTV 1 + 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', + 'info_dict': { + 'id': '1011', + 'ext': 'mp4', + 'title': r're:^VTV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:https?://.*\.png$', + } + }, { + # Downloading a video. + 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', + 'md5': '5263c63d738569ed507980f1e49ebc03', + 'info_dict': { + 'id': '109788', + 'ext': 'mp4', + 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang - TV Net', + 'thumbnail': r're:https?://.*\.JPG$', + } + }, { + # Radio live stream. Channel: VOV 1 + 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014', + 'info_dict': { + 'id': '1014', + 'ext': 'm4a', + 'vcodec': 'none', + 'title': r're:VOV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:https?://.*\.png$', + } + + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<title>(.+?)', webpage, 'title', default=None, fatal=False) + if title is None: + title = self._og_search_title(webpage) + title.strip() + + mediaplayer_div = self._search_regex(r'(]*id="mediaplayer"[^>]*>)', webpage, 'mediaplayer element') + mediaplayer_div_attributes = extract_attributes(mediaplayer_div) + + thumbnail = mediaplayer_div_attributes.get("data-image") + + json_url = mediaplayer_div_attributes["data-file"] + video_streams = self._download_json(json_url, video_id) + + + # get any working playlist from streams. Currently there's 2 and the first always works, + # but you never know in the future + for stream in video_streams: + formats = self._extract_m3u8_formats(stream.get("url"), video_id, ext="mp4", fatal=False) + if formats: + break + + # better support radio streams + if title.startswith("VOV"): + for f in formats: + f["ext"] = "m4a" + f["vcodec"] = "none" + + if "/video/" in url or "/radio/" in url: + is_live = False + elif "/kenh-truyen-hinh/" in url: + is_live = True + else: + is_live = None + + if is_live: + title = self._live_title(title) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + 'is_live': is_live, + } From a572ae6114deca4e8f2f0365ca7091749f01deaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 Jun 2018 01:35:23 +0700 Subject: [PATCH 242/488] [tvnet] Improve and fix issues (closes #15462) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/tvnet.py | 133 +++++++++++++++++++++++++++++ youtube_dl/extractor/vtv.py | 91 -------------------- 3 files changed, 134 insertions(+), 92 deletions(-) create mode 100644 youtube_dl/extractor/tvnet.py delete mode 100644 youtube_dl/extractor/vtv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e6d1fe70e..d4583b8e4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1139,6 +1139,7 @@ from .tvc import ( from .tvigle import TvigleIE from .tvland import TVLandIE from .tvn24 import TVN24IE +from .tvnet import TVNetIE from .tvnoe import TVNoeIE from .tvnow import ( TVNowIE, @@ -1306,7 +1307,6 @@ from .vrv import ( VRVSeriesIE, ) from .vshare import VShareIE -from .vtv import VTVIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py new file mode 100644 index 000000000..0ec2da4da --- /dev/null +++ b/youtube_dl/extractor/tvnet.py @@ -0,0 +1,133 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + unescapeHTML, +) + + +class TVNetIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?P[0-9]+)' + _TESTS = [{ + # video + 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', + 'md5': 'b4d7abe0252c9b47774760b7519c7558', + 'info_dict': { + 'id': '109788', + 'ext': 'mp4', + 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + 'view_count': int, + }, + }, { + # audio + 'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi', + 'md5': 'b5875ce9b0a2eecde029216d0e6db2ae', + 'info_dict': { + 'id': '27017', + 'ext': 'm4a', + 'title': 'VOV1 - Bản tin chiều (10/06/2018)', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + }, + }, { + # live stream + 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', + 'info_dict': { + 'id': '1011', + 'ext': 'mp4', + 'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }, { + # radio live stream + 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014', + 'info_dict': { + 'id': '1014', + 'ext': 'm4a', + 'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, default=None) or self._search_regex( + r'([^<]+)<', webpage, 'title') + title = re.sub(r'\s*-\s*TV Net\s*$', '', title) + + if '/video/' in url or '/radio/' in url: + is_live = False + elif '/kenh-truyen-hinh/' in url: + is_live = True + else: + is_live = None + + data_file = unescapeHTML(self._search_regex( + r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, + 'data file', group='url')) + + stream_urls = set() + formats = [] + for stream in self._download_json(data_file, video_id): + if not isinstance(stream, dict): + continue + stream_url = stream.get('url') + if (stream_url in stream_urls or not stream_url or + not isinstance(stream_url, compat_str)): + continue + stream_urls.add(stream_url) + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + # better support for radio streams + if title.startswith('VOV'): + for f in formats: + f.update({ + 'ext': 'm4a', + 'vcodec': 'none', + }) + + thumbnail = self._og_search_thumbnail( + webpage, default=None) or unescapeHTML( + self._search_regex( + r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, + 'thumbnail', default=None, group='url')) + + if is_live: + title = self._live_title(title) + + view_count = int_or_none(self._search_regex( + r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>', + webpage, 'view count', default=None)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'is_live': is_live, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/vtv.py b/youtube_dl/extractor/vtv.py deleted file mode 100644 index a9683dd85..000000000 --- a/youtube_dl/extractor/vtv.py +++ /dev/null @@ -1,91 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - -import re - -from ..utils import extract_attributes - -class VTVIE(InfoExtractor): - _VALID_URL = r'https?://(au|ca|cz|de|jp|kr|tw|us|vn)\.tvnet\.gov\.vn/[^/]*/(?P<id>[0-9]+)/?' - _TESTS = [{ - # Livestream. Channel: VTV 1 - 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', - 'info_dict': { - 'id': '1011', - 'ext': 'mp4', - 'title': r're:^VTV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'thumbnail': r're:https?://.*\.png$', - } - }, { - # Downloading a video. - 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', - 'md5': '5263c63d738569ed507980f1e49ebc03', - 'info_dict': { - 'id': '109788', - 'ext': 'mp4', - 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang - TV Net', - 'thumbnail': r're:https?://.*\.JPG$', - } - }, { - # Radio live stream. Channel: VOV 1 - 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014', - 'info_dict': { - 'id': '1014', - 'ext': 'm4a', - 'vcodec': 'none', - 'title': r're:VOV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'thumbnail': r're:https?://.*\.png$', - } - - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex(r'<title>(.+?)', webpage, 'title', default=None, fatal=False) - if title is None: - title = self._og_search_title(webpage) - title.strip() - - mediaplayer_div = self._search_regex(r'(]*id="mediaplayer"[^>]*>)', webpage, 'mediaplayer element') - mediaplayer_div_attributes = extract_attributes(mediaplayer_div) - - thumbnail = mediaplayer_div_attributes.get("data-image") - - json_url = mediaplayer_div_attributes["data-file"] - video_streams = self._download_json(json_url, video_id) - - - # get any working playlist from streams. Currently there's 2 and the first always works, - # but you never know in the future - for stream in video_streams: - formats = self._extract_m3u8_formats(stream.get("url"), video_id, ext="mp4", fatal=False) - if formats: - break - - # better support radio streams - if title.startswith("VOV"): - for f in formats: - f["ext"] = "m4a" - f["vcodec"] = "none" - - if "/video/" in url or "/radio/" in url: - is_live = False - elif "/kenh-truyen-hinh/" in url: - is_live = True - else: - is_live = None - - if is_live: - title = self._live_title(title) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - 'is_live': is_live, - } From 0645be49cb06f54135a9b92556207c1c468853ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 Jun 2018 01:41:23 +0700 Subject: [PATCH 243/488] [inc] PEP 8 --- youtube_dl/extractor/inc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/inc.py b/youtube_dl/extractor/inc.py index 8dee143ca..d5b258a0f 100644 --- a/youtube_dl/extractor/inc.py +++ b/youtube_dl/extractor/inc.py @@ -52,7 +52,7 @@ class IncIE(InfoExtractor): kaltura_id = self._search_regex( r'id=(["\'])kaltura_player_(?P.+?)\1', webpage, 'kaltura id', default=None, group='id') or self._parse_json(self._search_regex( - r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), + r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), display_id)['vid_kaltura_id'] return self.url_result( From e51752754d0dfe3e6597634c0c0f65508c55bcb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 Jun 2018 01:50:43 +0700 Subject: [PATCH 244/488] [tvnet] Improve video id extraction --- youtube_dl/extractor/tvnet.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py index 0ec2da4da..1083d2730 100644 --- a/youtube_dl/extractor/tvnet.py +++ b/youtube_dl/extractor/tvnet.py @@ -12,7 +12,7 @@ from ..utils import ( class TVNetIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?P[0-9]+)' + _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P[0-9]+)/' _TESTS = [{ # video 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', @@ -36,6 +36,18 @@ class TVNetIE(InfoExtractor): 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', 'is_live': False, }, + }, { + 'url': 'http://us.tvnet.gov.vn/video/118023/129999/ngay-0705', + 'info_dict': { + 'id': '129999', + 'ext': 'mp4', + 'title': 'VTV1 - Quốc hội với cử tri (11/06/2018)', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + }, + 'params': { + 'skip_download': True, + }, }, { # live stream 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', @@ -62,6 +74,9 @@ class TVNetIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'http://us.tvnet.gov.vn/phim/6136/25510/vtv3---ca-mot-doi-an-oan-tap-1-50/phim-truyen-hinh', + 'only_matching': True, }] def _real_extract(self, url): From 7dc9c60b4b848f4b1d38b2995c7d421c89be93c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 Jun 2018 02:05:58 +0700 Subject: [PATCH 245/488] [tvnet] Fix _VALID_URL --- youtube_dl/extractor/tvnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py index 1083d2730..2b2630b91 100644 --- a/youtube_dl/extractor/tvnet.py +++ b/youtube_dl/extractor/tvnet.py @@ -12,7 +12,7 @@ from ..utils import ( class TVNetIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P[0-9]+)/' + _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P\d+)(?:/|$)' _TESTS = [{ # video 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', From dc53c7863481f927bb011ad28784facec428ce22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 Jun 2018 02:06:30 +0700 Subject: [PATCH 246/488] [crackle] Add support for sonycrackle.com (closes #16698) --- youtube_dl/extractor/crackle.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index fc014f8b5..f4a616455 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -19,8 +19,8 @@ from ..utils import ( class CrackleIE(InfoExtractor): - _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' - _TEST = { + _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' + _TESTS = [{ # geo restricted to CA 'url': 'https://www.crackle.com/andromeda/2502343', 'info_dict': { @@ -45,7 +45,10 @@ class CrackleIE(InfoExtractor): # m3u8 download 'skip_download': True, } - } + }, { + 'url': 'https://www.sonycrackle.com/andromeda/2502343', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From 5d6c81b63f08f1ea0c2d01579c99fe7c8f0aa6e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 Jun 2018 03:12:29 +0700 Subject: [PATCH 247/488] [downloader/http] Fix resume when writing ot stdout (closes #16699) --- youtube_dl/downloader/http.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index a22875f69..5b1e96013 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -217,10 +217,11 @@ class HttpFD(FileDownloader): before = start # start measuring def retry(e): - if ctx.tmpfilename != '-': + to_stdout = ctx.tmpfilename == '-' + if not to_stdout: ctx.stream.close() ctx.stream = None - ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) raise RetryDownload(e) while True: From e0671819e71744b6a9a32fc3f4d5fbc8aca8a8f1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 12 Jun 2018 13:07:20 +0100 Subject: [PATCH 248/488] [abc] fix ABC IView extraction and add support for livestreams(closes #16704)(closes #12354) --- youtube_dl/extractor/abc.py | 42 +++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 512f04684..266d76481 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -105,22 +105,22 @@ class ABCIE(InfoExtractor): class ABCIViewIE(InfoExtractor): IE_NAME = 'abc.net.au:iview' - _VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P[^/?#]+)' _GEO_COUNTRIES = ['AU'] # ABC iview programs are normally available for 14 days only. _TESTS = [{ - 'url': 'https://iview.abc.net.au/programs/ben-and-hollys-little-kingdom/ZY9247A021S00', + 'url': 'https://iview.abc.net.au/show/ben-and-hollys-little-kingdom/series/0/video/ZX9371A050S00', 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', 'info_dict': { - 'id': 'ZY9247A021S00', + 'id': 'ZX9371A050S00', 'ext': 'mp4', - 'title': "Gaston's Visit", + 'title': "Gaston's Birthday", 'series': "Ben And Holly's Little Kingdom", - 'description': 'md5:18db170ad71cf161e006a4c688e33155', - 'upload_date': '20180318', + 'description': 'md5:f9de914d02f226968f598ac76f105bcf', + 'upload_date': '20180604', 'uploader_id': 'abc4kids', - 'timestamp': 1521400959, + 'timestamp': 1528140219, }, 'params': { 'skip_download': True, @@ -129,17 +129,16 @@ class ABCIViewIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_params = self._parse_json(self._search_regex( - r'videoParams\s*=\s*({.+?});', webpage, 'video params'), video_id) - title = video_params.get('title') or video_params['seriesTitle'] - stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') + video_params = self._download_json( + 'https://iview.abc.net.au/api/programs/' + video_id, video_id) + title = unescapeHTML(video_params.get('title') or video_params['seriesTitle']) + stream = next(s for s in video_params['playlist'] if s.get('type') in ('program', 'livestream')) - house_number = video_params.get('episodeHouseNumber') - path = '/auth/hls/sign?ts={0}&hn={1}&d=android-mobile'.format( + house_number = video_params.get('episodeHouseNumber') or video_id + path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format( int(time.time()), house_number) sig = hmac.new( - 'android.content.res.Resources'.encode('utf-8'), + b'android.content.res.Resources', path.encode('utf-8'), hashlib.sha256).hexdigest() token = self._download_webpage( 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id) @@ -169,18 +168,21 @@ class ABCIViewIE(InfoExtractor): 'ext': 'vtt', }] + is_live = video_params.get('livestream') == '1' + if is_live: + title = self._live_title(title) + return { 'id': video_id, - 'title': unescapeHTML(title), - 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), - 'thumbnail': self._html_search_meta(['og:image', 'twitter:image:src'], webpage), + 'title': title, + 'description': video_params.get('description'), + 'thumbnail': video_params.get('thumbnail'), 'duration': int_or_none(video_params.get('eventDuration')), 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), 'series': unescapeHTML(video_params.get('seriesTitle')), 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], - 'episode_number': int_or_none(self._html_search_meta('episodeNumber', webpage, default=None)), - 'episode': self._html_search_meta('episode_title', webpage, default=None), 'uploader_id': video_params.get('channel'), 'formats': formats, 'subtitles': subtitles, + 'is_live': is_live, } From 9aca7fe6a3551df6379079f570e5a8bdf517c670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 Jun 2018 20:25:50 +0700 Subject: [PATCH 249/488] [abc:iview] Extract more series metadata --- youtube_dl/extractor/abc.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 266d76481..4ac323bf6 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -181,6 +181,11 @@ class ABCIViewIE(InfoExtractor): 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), 'series': unescapeHTML(video_params.get('seriesTitle')), 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], + 'season_number': int_or_none(self._search_regex( + r'\bSeries\s+(\d+)\b', title, 'season number', default=None)), + 'episode_number': int_or_none(self._search_regex( + r'\bEp\s+(\d+)\b', title, 'episode number', default=None)), + 'episode_id': house_number, 'uploader_id': video_params.get('channel'), 'formats': formats, 'subtitles': subtitles, From f15f7a674b309eff00a66d16449f8d5abb1c6682 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 13 Jun 2018 14:46:00 +0100 Subject: [PATCH 250/488] [dailymotion] add support for password protected videos(closes #9789) --- youtube_dl/extractor/dailymotion.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index de27fffd4..0afb6a158 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,9 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import json +import base64 +import hashlib import itertools +import json +import random +import re +import string +import struct from .common import InfoExtractor @@ -64,7 +69,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader': 'Deadline', 'uploader_id': 'x1xm8ri', 'age_limit': 0, - 'view_count': int, }, }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', @@ -167,6 +171,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor): player = self._parse_json(player_v5, video_id) metadata = player['metadata'] + if metadata.get('error', {}).get('type') == 'password_protected': + password = self._downloader.params.get('videopassword') + if password: + r = int(metadata['id'][1:], 36) + us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=') + t = ''.join(random.choice(string.ascii_letters) for i in range(10)) + n = us64e(struct.pack('I', r)) + i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest()) + metadata = self._download_json( + 'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id) + self._check_error(metadata) formats = [] @@ -302,8 +317,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): def _check_error(self, info): error = info.get('error') - if info.get('error') is not None: - title = error['title'] + if error: + title = error.get('title') or error['message'] # See https://developer.dailymotion.com/api#access-error if error.get('code') == 'DM007': self.raise_geo_restricted(msg=title) From 18d66f04107b584c2d6ee6c175c44c7f2d81ecba Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 13 Jun 2018 15:12:42 +0100 Subject: [PATCH 251/488] [dailymotion] use compat_struct_pack --- youtube_dl/extractor/dailymotion.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 0afb6a158..9a74906cb 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -8,10 +8,9 @@ import json import random import re import string -import struct from .common import InfoExtractor - +from ..compat import compat_struct_pack from ..utils import ( determine_ext, error_to_compat_str, @@ -177,7 +176,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): r = int(metadata['id'][1:], 36) us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=') t = ''.join(random.choice(string.ascii_letters) for i in range(10)) - n = us64e(struct.pack('I', r)) + n = us64e(compat_struct_pack('I', r)) i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest()) metadata = self._download_json( 'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id) From aa56061627f9871b4793414b71a26976befd3a9c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 13 Jun 2018 16:46:59 +0100 Subject: [PATCH 252/488] [discoverynetworks] Add support for disco-api videos(closes #16724) --- youtube_dl/extractor/discoverynetworks.py | 19 ++- youtube_dl/extractor/dplay.py | 137 +++++++++++----------- 2 files changed, 87 insertions(+), 69 deletions(-) diff --git a/youtube_dl/extractor/discoverynetworks.py b/youtube_dl/extractor/discoverynetworks.py index b6653784c..fba1ef221 100644 --- a/youtube_dl/extractor/discoverynetworks.py +++ b/youtube_dl/extractor/discoverynetworks.py @@ -3,8 +3,8 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE +from .dplay import DPlayIE from ..compat import ( compat_parse_qs, compat_urlparse, @@ -12,8 +12,13 @@ from ..compat import ( from ..utils import smuggle_url -class DiscoveryNetworksDeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:discovery|tlc|animalplanet|dmax)\.de/(?:.*#(?P\d+)|(?:[^/]+/)*videos/(?P[^/?#]+))' +class DiscoveryNetworksDeIE(DPlayIE): + _VALID_URL = r'''(?x)https?://(?:www\.)?(?P<site>discovery|tlc|animalplanet|dmax)\.de/ + (?: + .*\#(?P<id>\d+)| + (?:[^/]+/)*videos/(?P<display_id>[^/?#]+)| + programme/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+) + )''' _TESTS = [{ 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', @@ -40,6 +45,14 @@ class DiscoveryNetworksDeIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + alternate_id = mobj.group('alternate_id') + if alternate_id: + self._initialize_geo_bypass({ + 'countries': ['DE'], + }) + return self._get_disco_api_info( + url, '%s/%s' % (mobj.group('programme'), alternate_id), + 'sonic-eu1-prod.disco-api.com', mobj.group('site') + 'de') brightcove_id = mobj.group('id') if not brightcove_id: title = mobj.group('title') diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 8e0374320..fe47f6dce 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -97,6 +97,75 @@ class DPlayIE(InfoExtractor): 'only_matching': True, }] + def _get_disco_api_info(self, url, display_id, disco_host, realm): + disco_base = 'https://' + disco_host + token = self._download_json( + '%s/token' % disco_base, display_id, 'Downloading token', + query={ + 'realm': realm, + })['data']['attributes']['token'] + headers = { + 'Referer': url, + 'Authorization': 'Bearer ' + token, + } + video = self._download_json( + '%s/content/videos/%s' % (disco_base, display_id), display_id, + headers=headers, query={ + 'include': 'show' + }) + video_id = video['data']['id'] + info = video['data']['attributes'] + title = info['name'] + formats = [] + for format_id, format_dict in self._download_json( + '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), + display_id, headers=headers)['data']['attributes']['streaming'].items(): + if not isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url: + continue + ext = determine_ext(format_url) + if format_id == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, display_id, mpd_id='dash', fatal=False)) + elif format_id == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + + series = None + try: + included = video.get('included') + if isinstance(included, list): + show = next(e for e in included if e.get('type') == 'show') + series = try_get( + show, lambda x: x['attributes']['name'], compat_str) + except StopIteration: + pass + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': info.get('description'), + 'duration': float_or_none( + info.get('videoDuration'), scale=1000), + 'timestamp': unified_timestamp(info.get('publishStart')), + 'series': series, + 'season_number': int_or_none(info.get('seasonNumber')), + 'episode_number': int_or_none(info.get('episodeNumber')), + 'age_limit': int_or_none(info.get('minimum_age')), + 'formats': formats, + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') @@ -113,72 +182,8 @@ class DPlayIE(InfoExtractor): if not video_id: host = mobj.group('host') - disco_base = 'https://disco-api.%s' % host - self._download_json( - '%s/token' % disco_base, display_id, 'Downloading token', - query={ - 'realm': host.replace('.', ''), - }) - video = self._download_json( - '%s/content/videos/%s' % (disco_base, display_id), display_id, - headers={ - 'Referer': url, - 'x-disco-client': 'WEB:UNKNOWN:dplay-client:0.0.1', - }, query={ - 'include': 'show' - }) - video_id = video['data']['id'] - info = video['data']['attributes'] - title = info['name'] - formats = [] - for format_id, format_dict in self._download_json( - '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), - display_id)['data']['attributes']['streaming'].items(): - if not isinstance(format_dict, dict): - continue - format_url = format_dict.get('url') - if not format_url: - continue - ext = determine_ext(format_url) - if format_id == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, display_id, mpd_id='dash', fatal=False)) - elif format_id == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) - - series = None - try: - included = video.get('included') - if isinstance(included, list): - show = next(e for e in included if e.get('type') == 'show') - series = try_get( - show, lambda x: x['attributes']['name'], compat_str) - except StopIteration: - pass - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': info.get('description'), - 'duration': float_or_none( - info.get('videoDuration'), scale=1000), - 'timestamp': unified_timestamp(info.get('publishStart')), - 'series': series, - 'season_number': int_or_none(info.get('seasonNumber')), - 'episode_number': int_or_none(info.get('episodeNumber')), - 'age_limit': int_or_none(info.get('minimum_age')), - 'formats': formats, - } + return self._get_disco_api_info( + url, display_id, 'disco-api.' + host, host.replace('.', '')) info = self._download_json( 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), From 03eef0f03259bfcae284a56d00035950dd54f316 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 14 Jun 2018 01:22:42 +0700 Subject: [PATCH 253/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ChangeLog b/ChangeLog index b808ad6ba..11812b195 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +version <unreleased> + +Core +* [downloader/http] Fix retry on error when streaming to stdout (#16699) + +Extractors ++ [discoverynetworks] Add support for disco-api videos (#16724) ++ [dailymotion] Add support for password protected videos (#9789) ++ [abc:iview] Add support for livestreams (#12354) +* [abc:iview] Fix extraction (#16704) ++ [crackle] Add support for sonycrackle.com (#16698) ++ [tvnet] Add support for tvnet.gov.vn (#15462) +* [nrk] Update API hosts and try all previously known ones (#16690) +* [wimp] Fix Youtube embeds extraction + + version 2018.06.11 Extractors From c797db4a2fb7c8e41485bac74fe7f78295bab556 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 14 Jun 2018 01:24:53 +0700 Subject: [PATCH 254/488] release 2018.06.14 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index d5be8003b..1cfb54bfd 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.11*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.11** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.14** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.11 +[debug] youtube-dl version 2018.06.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 11812b195..062000594 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.14 Core * [downloader/http] Fix retry on error when streaming to stdout (#16699) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e1a9f2236..705279ac1 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -893,6 +893,7 @@ - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **TVN24** + - **TVNet** - **TVNoe** - **TVNow** - **TVNowList** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e72f42cf2..1533dceb4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.11' +__version__ = '2018.06.14' From 61cb66830f5097b528aab381eb6b343a89f73cbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 14 Jun 2018 22:40:30 +0700 Subject: [PATCH 255/488] [bilibili] Restrict cid regex (closes #16638, closes #16734) --- youtube_dl/extractor/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 3e3348ef5..4d6b051fe 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -114,7 +114,7 @@ class BiliBiliIE(InfoExtractor): if 'anime/' not in url: cid = self._search_regex( - r'cid(?:["\']:|=)(\d+)', webpage, 'cid', + r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', default=None ) or compat_parse_qs(self._search_regex( [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', From 9b0b62753432244fd062d99cc5dc604d6bad7877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Jun 2018 02:59:15 +0700 Subject: [PATCH 256/488] [downloader/rtmp] Fix downloading in verbose mode (closes #16736) --- youtube_dl/downloader/rtmp.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 9e0ddbb18..fbb7f51b0 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -24,13 +24,12 @@ class RtmpFD(FileDownloader): def real_download(self, filename, info_dict): def run_rtmpdump(args): start = time.time() + resume_percent = None + resume_downloaded_data_len = None proc = subprocess.Popen(args, stderr=subprocess.PIPE) cursor_in_new_line = True - - def dl(): - resume_percent = None - resume_downloaded_data_len = None - proc_stderr_closed = False + proc_stderr_closed = False + try: while not proc_stderr_closed: # read line from stderr line = '' @@ -90,12 +89,8 @@ class RtmpFD(FileDownloader): self.to_screen('') cursor_in_new_line = True self.to_screen('[rtmpdump] ' + line) - - try: - dl() finally: proc.wait() - if not cursor_in_new_line: self.to_screen('') return proc.returncode From 87f89dacddfa46399aea9252ca078f5f386dce38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 16 Jun 2018 02:55:20 +0700 Subject: [PATCH 257/488] [pbs] Improve extraction (closes #16623, closes #16684) --- youtube_dl/extractor/pbs.py | 57 ++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8d6f2dd3d..52ab2f158 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, @@ -375,6 +376,35 @@ class PBSIE(InfoExtractor): }, 'expected_warnings': ['HTTP Error 403: Forbidden'], }, + { + 'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/', + 'info_dict': { + 'id': '3007193718', + 'ext': 'mp4', + 'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster", + 'description': 'md5:37efbac85e0c09b009586523ec143652', + 'duration': 6292, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, + { + 'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/', + 'info_dict': { + 'id': '3011407934', + 'ext': 'mp4', + 'title': 'Stories from the Stage - Road Trip', + 'duration': 1619, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -438,6 +468,7 @@ class PBSIE(InfoExtractor): r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ + r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/ ] media_id = self._search_regex( @@ -472,7 +503,8 @@ class PBSIE(InfoExtractor): if not url: url = self._og_search_url(webpage) - mobj = re.match(self._VALID_URL, url) + mobj = re.match( + self._VALID_URL, self._proto_relative_url(url.strip())) player_id = mobj.group('player_id') if not display_id: @@ -482,13 +514,27 @@ class PBSIE(InfoExtractor): url, display_id, note='Downloading player page', errnote='Could not download player page') video_id = self._search_regex( - r'<div\s+id="video_([0-9]+)"', player_page, 'video ID') + r'<div\s+id=["\']video_(\d+)', player_page, 'video ID', + default=None) + if not video_id: + video_info = self._extract_video_data( + player_page, 'video data', display_id) + video_id = compat_str( + video_info.get('id') or video_info['contentID']) else: video_id = mobj.group('id') display_id = video_id return video_id, display_id, None, description + def _extract_video_data(self, string, name, video_id, fatal=True): + return self._parse_json( + self._search_regex( + [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', + r'window\.videoBridge\s*=\s*({.+?});'], + string, name, default='{}'), + video_id, transform_source=js_to_json, fatal=fatal) + def _real_extract(self, url): video_id, display_id, upload_date, description = self._extract_webpage(url) @@ -519,11 +565,8 @@ class PBSIE(InfoExtractor): 'http://player.pbs.org/%s/%s' % (page, video_id), display_id, 'Downloading %s page' % page, fatal=False) if player: - video_info = self._parse_json( - self._search_regex( - [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', r'window\.videoBridge\s*=\s*({.+?});'], - player, '%s video data' % page, default='{}'), - display_id, transform_source=js_to_json, fatal=False) + video_info = self._extract_video_data( + player, '%s video data' % page, display_id, fatal=False) if video_info: extract_redirect_urls(video_info) if not info: From 81c5df4f2ce33cff5d47b9ee29edf08b50998a53 Mon Sep 17 00:00:00 2001 From: Urgau <lolo.branstett@numericable.fr> Date: Sat, 16 Jun 2018 00:08:44 +0200 Subject: [PATCH 258/488] [vidzi] Fix extraction (closes #16678) --- youtube_dl/extractor/vidzi.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 9026e778c..d70283479 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -54,7 +54,8 @@ class VidziIE(InfoExtractor): self._search_regex( r'setup\(([^)]+)\)', code, 'jwplayer data', default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=js_to_json) + video_id, transform_source=lambda s: js_to_json( + re.sub(r'\s*\+\s*window\[.+?\]', '', s))) if jwplayer_data: break From 734d461ca04a9f271dd463aa75d44ac82377057e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 16 Jun 2018 21:14:36 +0700 Subject: [PATCH 259/488] [expressen] Add extractor --- youtube_dl/extractor/expressen.py | 77 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 78 insertions(+) create mode 100644 youtube_dl/extractor/expressen.py diff --git a/youtube_dl/extractor/expressen.py b/youtube_dl/extractor/expressen.py new file mode 100644 index 000000000..f61178012 --- /dev/null +++ b/youtube_dl/extractor/expressen.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + unescapeHTML, + unified_timestamp, +) + + +class ExpressenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?expressen\.se/tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/', + 'md5': '2fbbe3ca14392a6b1b36941858d33a45', + 'info_dict': { + 'id': '8690962', + 'ext': 'mp4', + 'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden', + 'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 788, + 'timestamp': 1526639109, + 'upload_date': '20180518', + }, + }, { + 'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + def extract_data(name): + return self._parse_json( + self._search_regex( + r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % name, + webpage, 'info', group='value'), + display_id, transform_source=unescapeHTML) + + info = extract_data('video-tracking-info') + video_id = info['videoId'] + + data = extract_data('article-data') + stream = data['stream'] + + if determine_ext(stream) == 'm3u8': + formats = self._extract_m3u8_formats( + stream, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + else: + formats = [{ + 'url': stream, + }] + self._sort_formats(formats) + + title = info.get('titleRaw') or data['title'] + description = info.get('descriptionRaw') + thumbnail = info.get('socialMediaImage') or data.get('image') + duration = int_or_none(info.get('videoTotalSecondsDuration') or + data.get('totalSecondsDuration')) + timestamp = unified_timestamp(info.get('publishDate')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d4583b8e4..c3e6daa24 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -335,6 +335,7 @@ from .esri import EsriVideoIE from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE +from .expressen import ExpressenIE from .extremetube import ExtremeTubeIE from .eyedotv import EyedoTVIE from .facebook import ( From 764cd4e6f3450997eb0499b68b17b580a5e074f3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 17 Jun 2018 02:43:24 +0100 Subject: [PATCH 260/488] [rtbf] improve extraction - add support for audio and live streams(closes #11923)(closes #9638) - extract HLS, DASH and all HTTP formats - extract subtitles - fixup specific http urls(fixes #16101) --- youtube_dl/extractor/rtbf.py | 127 ++++++++++++++++++++++++++--------- 1 file changed, 95 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 28cc5522d..acff9766a 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -1,10 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - int_or_none, ExtractorError, + float_or_none, + int_or_none, + strip_or_none, ) @@ -14,20 +18,19 @@ class RTBFIE(InfoExtractor): (?: video/[^?]+\?.*\bid=| ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| - auvio/[^/]+\?.*id= + auvio/[^/]+\?.*\b(?P<live>l)?id= )(?P<id>\d+)''' _TESTS = [{ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', + 'md5': '8c876a1cceeb6cf31b476461ade72384', 'info_dict': { 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', - 'description': 'Football - Diables Rouges', - 'duration': 3099, + 'description': '(du 25/04/2014)', + 'duration': 3099.54, 'upload_date': '20140425', - 'timestamp': 1398456336, - 'uploader': 'rtbfsport', + 'timestamp': 1398456300, } }, { # geo restricted @@ -39,6 +42,18 @@ class RTBFIE(InfoExtractor): }, { 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, }] _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' _PROVIDERS = { @@ -53,46 +68,94 @@ class RTBFIE(InfoExtractor): ] def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_json( - 'http://www.rtbf.be/api/media/video?method=getVideoDetail&args[]=%s' % video_id, video_id) + live, media_id = re.match(self._VALID_URL, url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + data = self._parse_json(self._html_search_regex( + r'data-media="([^"]+)"', embed_page, 'media data'), media_id) error = data.get('error') if error: raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - data = data['data'] - provider = data.get('provider') if provider in self._PROVIDERS: return self.url_result(data['url'], self._PROVIDERS[provider]) + title = data['title'] + is_live = data.get('isLive') + if is_live: + title = self._live_title(title) + height_re = r'-(\d+)p\.' formats = [] - for key, format_id in self._QUALITIES: - format_url = data.get(key + 'Url') - if format_url: + + m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x + http_url = data.get('url') + if formats and http_url and re.search(height_re, http_url): + http_url = fix_url(http_url) + for m3u8_f in formats.copy(): + height = m3u8_f.get('height') + if not height: + continue + f = m3u8_f.copy() + del f['protocol'] + f.update({ + 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), + 'url': re.sub(height_re, '-%dp.' % height, http_url), + }) + formats.append(f) + else: + sources = data.get('sources') or {} + for key, format_id in self._QUALITIES: + format_url = sources.get(key) + if not format_url: + continue + height = int_or_none(self._search_regex( + height_re, format_url, 'height', default=None)) formats.append({ 'format_id': format_id, - 'url': format_url, + 'url': fix_url(format_url), + 'height': height, }) - thumbnails = [] - for thumbnail_id, thumbnail_url in data.get('thumbnail', {}).items(): - if thumbnail_id != 'default': - thumbnails.append({ - 'url': self._IMAGE_HOST + thumbnail_url, - 'id': thumbnail_id, - }) + mpd_url = data.get('urlDash') + if not data.get('drm') and mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, media_id, mpd_id='dash', fatal=False)) + + audio_url = data.get('urlAudio') + if audio_url: + formats.append({ + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + subtitles = {} + for track in (data.get('tracks') or {}).values(): + sub_url = track.get('url') + if not sub_url: + continue + subtitles.setdefault(track.get('lang') or 'fr', []).append({ + 'url': sub_url, + }) return { - 'id': video_id, + 'id': media_id, 'formats': formats, - 'title': data['title'], - 'description': data.get('description') or data.get('subtitle'), - 'thumbnails': thumbnails, - 'duration': data.get('duration') or data.get('realDuration'), - 'timestamp': int_or_none(data.get('created')), - 'view_count': int_or_none(data.get('viewCount')), - 'uploader': data.get('channel'), - 'tags': data.get('tags'), + 'title': title, + 'description': strip_or_none(data.get('description')), + 'thumbnail': data.get('thumbnail'), + 'duration': float_or_none(data.get('realDuration')), + 'timestamp': int_or_none(data.get('liveFrom')), + 'series': data.get('programLabel'), + 'subtitles': subtitles, + 'is_live': is_live, } From 18825117545690499dc7064cd5ba207ca5ca3e23 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 17 Jun 2018 12:01:14 +0100 Subject: [PATCH 261/488] [6play] add support for rtlplay.be and extract hd usp formats --- youtube_dl/extractor/sixplay.py | 43 ++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 69951e387..1f8469a90 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -19,29 +19,33 @@ from ..utils import ( class SixPlayIE(InfoExtractor): IE_NAME = '6play' - _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.6play.fr/le-meilleur-patissier-p_1807/le-meilleur-patissier-special-fetes-mercredi-a-21-00-sur-m6-c_11638450', - 'md5': '42310bffe4ba3982db112b9cd3467328', + _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay.be)/.+?-c_)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051', + 'md5': '31fcd112637baa0c2ab92c4fcd8baf27', 'info_dict': { - 'id': '11638450', + 'id': '12041051', 'ext': 'mp4', - 'title': 'Le Meilleur Pâtissier, spécial fêtes mercredi à 21:00 sur M6', - 'description': 'md5:308853f6a5f9e2d55a30fc0654de415f', - 'duration': 39, - 'series': 'Le meilleur pâtissier', + 'title': 'Le but qui a marqué l\'histoire du football français !', + 'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851', }, - 'params': { - 'skip_download': True, - }, - } + }, { + 'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) + domain, video_id = re.search(self._VALID_URL, url).groups() + service, consumer_name = { + '6play.fr': ('6play', 'm6web'), + 'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'), + }.get(domain, ('6play', 'm6web')) data = self._download_json( - 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/6play/videos/clip_%s' % video_id, - video_id, query={ + 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id), + video_id, headers={ + 'x-customer-name': consumer_name + }, query={ 'csa': 5, 'with': 'clips', }) @@ -65,7 +69,12 @@ class SixPlayIE(InfoExtractor): subtitles.setdefault('fr', []).append({'url': asset_url}) continue if container == 'm3u8' or ext == 'm3u8': - if protocol == 'usp' and not compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: + if protocol == 'usp': + if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: + urlh = self._request_webpage(asset_url, video_id, fatal=False) + if not urlh: + continue + asset_url = urlh.geturl() asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url) formats.extend(self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', From 8b183bd5f800792cfc37da8ef2383fb5ba88195c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 17 Jun 2018 15:53:29 +0100 Subject: [PATCH 262/488] [tf1] try all supported adaptive urls --- youtube_dl/extractor/tf1.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index e595c4a69..903f47380 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -19,6 +19,7 @@ class TF1IE(InfoExtractor): # Sometimes wat serves the whole file with the --test option 'skip_download': True, }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', 'info_dict': { From 0adf213d8cce21e1a6ca6be7df532d67d184fbe2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 17 Jun 2018 15:56:52 +0100 Subject: [PATCH 263/488] [wat] try all supported adaptive urls --- youtube_dl/extractor/wat.py | 41 +++++++++++++++---------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 20fef1f04..8ef3e0906 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -19,7 +19,6 @@ class WatIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', - 'md5': '83d882d9de5c9d97f0bb2c6273cde56a', 'info_dict': { 'id': '11713067', 'ext': 'mp4', @@ -28,10 +27,15 @@ class WatIE(InfoExtractor): 'upload_date': '20140819', 'duration': 120, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': '34bdfa5ca9fd3c7eb88601b635b0424c', + 'md5': 'b16574df2c3cd1a36ca0098f2a791925', 'info_dict': { 'id': '11713075', 'ext': 'mp4', @@ -98,38 +102,25 @@ class WatIE(InfoExtractor): formats = [] try: + alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] manifest_urls = self._download_json( 'http://www.wat.tv/get/webhtml/' + video_id, video_id) m3u8_url = manifest_urls.get('hls') if m3u8_url: m3u8_url = remove_bitrate_limit(m3u8_url) - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) + for m3u8_alt_url in alt_urls(m3u8_url): + formats.extend(self._extract_m3u8_formats( + m3u8_alt_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) formats.extend(self._extract_f4m_formats( - m3u8_url.replace('ios', 'web').replace('.m3u8', '.f4m'), + m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), video_id, f4m_id='hds', fatal=False)) - http_url = extract_url('android5/%s.mp4', 'http') - if http_url: - for m3u8_format in m3u8_formats: - vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') - if not vbr or not abr: - continue - format_id = m3u8_format['format_id'].replace('hls', 'http') - fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url) - if self._is_valid_url(fmt_url, video_id, format_id): - f = m3u8_format.copy() - f.update({ - 'url': fmt_url, - 'format_id': format_id, - 'protocol': 'http', - }) - formats.append(f) mpd_url = manifest_urls.get('mpd') if mpd_url: - formats.extend(self._extract_mpd_formats(remove_bitrate_limit( - mpd_url), video_id, mpd_id='dash', fatal=False)) + mpd_url = remove_bitrate_limit(mpd_url) + for mpd_alt_url in alt_urls(mpd_url): + formats.extend(self._extract_mpd_formats( + mpd_alt_url, video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) except ExtractorError: abr = 64 From ce0edda0f9c0d8cf6250edfa7a43ddbccd101cd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 00:49:50 +0700 Subject: [PATCH 264/488] [markiza] Add extractors (closes #16750) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/markiza.py | 121 +++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 youtube_dl/extractor/markiza.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c3e6daa24..3b3964c01 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -590,6 +590,10 @@ from .mangomolo import ( MangomoloLiveIE, ) from .manyvids import ManyVidsIE +from .markiza import ( + MarkizaIE, + MarkizaPageIE, +) from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE diff --git a/youtube_dl/extractor/markiza.py b/youtube_dl/extractor/markiza.py new file mode 100644 index 000000000..e6bfab114 --- /dev/null +++ b/youtube_dl/extractor/markiza.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + orderedSet, + parse_duration, + try_get, +) + + +class MarkizaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?videoarchiv\.markiza\.sk/(?:video/(?:[^/]+/)*|embed/)(?P<id>\d+)(?:[_/]|$)' + _TESTS = [{ + 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723_oteckovia-109', + 'md5': 'ada4e9fad038abeed971843aa028c7b0', + 'info_dict': { + 'id': '139078', + 'ext': 'mp4', + 'title': 'Oteckovia 109', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2760, + }, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/televizne-noviny/televizne-noviny/85430_televizne-noviny', + 'info_dict': { + 'id': '85430', + 'title': 'Televízne noviny', + }, + 'playlist_count': 23, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/84723', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/filmy/85190_kamenak', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/reflex/zo-zakulisia/84651_pribeh-alzbetky', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/embed/85295', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'http://videoarchiv.markiza.sk/json/video_jwplayer7.json', + video_id, query={'id': video_id}) + + info = self._parse_jwplayer_data(data, m3u8_id='hls', mpd_id='dash') + + if info.get('_type') == 'playlist': + info.update({ + 'id': video_id, + 'title': try_get( + data, lambda x: x['details']['name'], compat_str), + }) + else: + info['duration'] = parse_duration( + try_get(data, lambda x: x['details']['duration'], compat_str)) + return info + + +class MarkizaPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:(?:[^/]+\.)?markiza|tvnoviny)\.sk/(?:[^/]+/)*(?P<id>\d+)_' + _TESTS = [{ + 'url': 'http://www.markiza.sk/soubiz/zahranicny/1923705_oteckovia-maju-svoj-den-ti-slavni-nie-su-o-nic-menej-rozkosni', + 'md5': 'ada4e9fad038abeed971843aa028c7b0', + 'info_dict': { + 'id': '139355', + 'ext': 'mp4', + 'title': 'Oteckovia 110', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2604, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://dajto.markiza.sk/filmy-a-serialy/1774695_frajeri-vo-vegas', + 'only_matching': True, + }, { + 'url': 'http://superstar.markiza.sk/aktualne/1923870_to-je-ale-telo-spevacka-ukazala-sexy-postavicku-v-bikinach', + 'only_matching': True, + }, { + 'url': 'http://hybsa.markiza.sk/aktualne/1923790_uzasna-atmosfera-na-hybsa-v-poprade-superstaristi-si-prve-koncerty-pred-davom-ludi-poriadne-uzili', + 'only_matching': True, + }, { + 'url': 'http://doma.markiza.sk/filmy/1885250_moja-vysnivana-svadba', + 'only_matching': True, + }, { + 'url': 'http://www.tvnoviny.sk/domace/1923887_po-smrti-manzela-ju-cakalo-poriadne-prekvapenie', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MarkizaIE.suitable(url) else super(MarkizaPageIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id) + for video_id in orderedSet(re.findall( + r'(?:initPlayer_|data-entity=["\']|id=["\']player_)(\d+)', + webpage))] + + return self.playlist_result(entries, playlist_id) From 9e761fe6f555a3ad0b92bdc2c651a4c5b8aff887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 01:31:49 +0700 Subject: [PATCH 265/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog b/ChangeLog index 062000594..38cfcd8fd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +version <unreleased> + +Core +* [downloader/rtmp] Fix downloading in verbose mode (#16736) + +Extractors ++ [markiza] Add support for markiza.sk (#16750) +* [wat] Try all supported adaptive URLs ++ [6play] Add support for rtlplay.be and extract hd usp formats ++ [rtbf] Add support for audio and live streams (#9638, #11923) ++ [rtbf] Extract HLS, DASH and all HTTP formats ++ [rtbf] Extract subtitles ++ [rtbf] Fixup specific HTTP URLs (#16101) ++ [expressen] Add support for expressen.se +* [vidzi] Fix extraction (#16678) +* [pbs] Improve extraction (#16623, #16684) +* [bilibili] Restrict cid regular expression (#16638, #16734) + + version 2018.06.14 Core From 858cf4dc2966d398d939cedffc160afad2484f8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 01:34:36 +0700 Subject: [PATCH 266/488] release 2018.06.18 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 1cfb54bfd..de3888214 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.14** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.14 +[debug] youtube-dl version 2018.06.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 38cfcd8fd..fe5087097 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.18 Core * [downloader/rtmp] Fix downloading in verbose mode (#16736) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 705279ac1..432a7ba93 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -266,6 +266,7 @@ - **Europa** - **EveryonesMixtape** - **ExpoTV** + - **Expressen** - **ExtremeTube** - **EyedoTV** - **facebook** @@ -455,6 +456,8 @@ - **mangomolo:live** - **mangomolo:video** - **ManyVids** + - **Markiza** + - **MarkizaPage** - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1533dceb4..49fef60ea 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.14' +__version__ = '2018.06.18' From 8ba84e4600229c9baec6410b0c0c9e500c0105b5 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 17 Jun 2018 20:40:35 +0100 Subject: [PATCH 267/488] [tvnow] try all clear manifest urls(closes #15361) --- youtube_dl/extractor/tvnow.py | 53 +++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py index 808571ece..60937616f 100644 --- a/youtube_dl/extractor/tvnow.py +++ b/youtube_dl/extractor/tvnow.py @@ -19,8 +19,8 @@ class TVNowBaseIE(InfoExtractor): _VIDEO_FIELDS = ( 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', - 'manifest.dashclear', 'format.title', 'format.defaultImage169Format', - 'format.defaultImage169Logo') + 'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear', + 'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo') def _call_api(self, path, video_id, query): return self._download_json( @@ -31,27 +31,42 @@ class TVNowBaseIE(InfoExtractor): video_id = compat_str(info['id']) title = info['title'] - mpd_url = info['manifest']['dashclear'] - if not mpd_url: + paths = [] + for manifest_url in (info.get('manifest') or {}).values(): + if not manifest_url: + continue + manifest_url = update_url_query(manifest_url, {'filter': ''}) + path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') + if path in paths: + continue + paths.append(path) + + def url_repl(proto, suffix): + return re.sub( + r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( + r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', + '.ism/' + suffix, manifest_url)) + + formats = self._extract_mpd_formats( + url_repl('dash', '.mpd'), video_id, + mpd_id='dash', fatal=False) + formats.extend(self._extract_ism_formats( + url_repl('hss', 'Manifest'), + video_id, ism_id='mss', fatal=False)) + formats.extend(self._extract_m3u8_formats( + url_repl('hls', '.m3u8'), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + if formats: + break + else: if info.get('isDrm'): raise ExtractorError( 'Video %s is DRM protected' % video_id, expected=True) if info.get('geoblocked'): - raise ExtractorError( - 'Video %s is not available from your location due to geo restriction' % video_id, - expected=True) + raise self.raise_geo_restricted() if not info.get('free', True): raise ExtractorError( 'Video %s is not available for free' % video_id, expected=True) - - mpd_url = update_url_query(mpd_url, {'filter': ''}) - formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False) - formats.extend(self._extract_ism_formats( - mpd_url.replace('dash.', 'hss.').replace('/.mpd', '/Manifest'), - video_id, ism_id='mss', fatal=False)) - formats.extend(self._extract_m3u8_formats( - mpd_url.replace('dash.', 'hls.').replace('/.mpd', '/.m3u8'), - video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) description = info.get('articleLong') or info.get('articleShort') @@ -88,7 +103,7 @@ class TVNowBaseIE(InfoExtractor): class TVNowIE(TVNowBaseIE): _VALID_URL = r'''(?x) https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ + (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/ (?P<show_id>[^/]+)/ (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+) ''' @@ -140,11 +155,13 @@ class TVNowIE(TVNowBaseIE): }] def _real_extract(self, url): - display_id = '%s/%s' % re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + display_id = '%s/%s' % mobj.group(2, 3) info = self._call_api( 'movies/' + display_id, display_id, query={ 'fields': ','.join(self._VIDEO_FIELDS), + 'station': mobj.group(1), }) return self._extract_video(info, display_id) From 075a13d3e9e860f0033ea5a37795bebba02690b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 03:22:08 +0700 Subject: [PATCH 268/488] [compat] Introduce compat_integer_types --- youtube_dl/compat.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 4a611f183..7b770340f 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2787,6 +2787,12 @@ except NameError: # Python 3 compat_numeric_types = (int, float, complex) +try: + compat_integer_types = (int, long) +except NameError: # Python 3 + compat_integer_types = (int, ) + + if sys.version_info < (2, 7): def compat_socket_create_connection(address, timeout, source_address=None): host, port = address @@ -2974,6 +2980,7 @@ __all__ = [ 'compat_http_client', 'compat_http_server', 'compat_input', + 'compat_integer_types', 'compat_itertools_count', 'compat_kwargs', 'compat_numeric_types', From d391b7e23d3d6c2af03c6329b4bf059ec095f33d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 04:01:48 +0700 Subject: [PATCH 269/488] [extractor/common] Introduce expected_status for convenient accept of failed HTTP requests Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response. --- youtube_dl/extractor/common.py | 135 +++++++++++++++++++++++++++------ 1 file changed, 113 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a2548dba3..394f34372 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -19,6 +19,7 @@ from ..compat import ( compat_cookies, compat_etree_fromstring, compat_getpass, + compat_integer_types, compat_http_client, compat_os_name, compat_str, @@ -548,8 +549,26 @@ class InfoExtractor(object): def IE_NAME(self): return compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): - """ Returns the response handle """ + @staticmethod + def __can_accept_status_code(err, expected_status): + assert isinstance(err, compat_urllib_error.HTTPError) + if expected_status is None: + return False + if isinstance(expected_status, compat_integer_types): + return err.code == expected_status + elif isinstance(expected_status, (list, tuple)): + return err.code in expected_status + elif callable(expected_status): + return expected_status(err.code) is True + else: + assert False + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): + """ + Return the response handle. + + See _download_webpage docstring for arguments specification. + """ if note is None: self.report_download_webpage(video_id) elif note is not False: @@ -578,6 +597,10 @@ class InfoExtractor(object): try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if isinstance(err, compat_urllib_error.HTTPError): + if self.__can_accept_status_code(err, expected_status): + return err.fp + if errnote is False: return False if errnote is None: @@ -590,13 +613,17 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}): - """ Returns a tuple (page content as string, URL handle) """ + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + """ + Return a tuple (page content as string, URL handle). + + See _download_webpage docstring for arguments specification. + """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) if urlh is False: assert not fatal return False @@ -685,13 +712,52 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): - """ Returns the data of the page as a string """ + def _download_webpage( + self, url_or_request, video_id, note=None, errnote=None, + fatal=True, tries=1, timeout=5, encoding=None, data=None, + headers={}, query={}, expected_status=None): + """ + Return the data of the page as a string. + + Arguments: + url_or_request -- plain text URL as a string or + a compat_urllib_request.Requestobject + video_id -- Video/playlist/item identifier (string) + + Keyword arguments: + note -- note printed before downloading (string) + errnote -- note printed in case of an error (string) + fatal -- flag denoting whether error should be considered fatal, + i.e. whether it should cause ExtractionError to be raised, + otherwise a warning will be reported and extraction continued + tries -- number of tries + timeout -- sleep interval between tries + encoding -- encoding for a page content decoding, guessed automatically + when not explicitly specified + data -- POST data (bytes) + headers -- HTTP headers (dict) + query -- URL query (dict) + expected_status -- allows to accept failed HTTP requests (non 2xx + status code) by explicitly specifying a set of accepted status + codes. Can be any of the following entities: + - an integer type specifying an exact failed status code to + accept + - a list or a tuple of integer types specifying a list of + failed status codes to accept + - a callable accepting an actual failed status code and + returning True if it should be accepted + Note that this argument does not affect success status codes (2xx) + which are always accepted. + """ + success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal, + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -707,11 +773,17 @@ class InfoExtractor(object): def _download_xml_handle( self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)""" + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle). + + See _download_webpage docstring for arguments specification. + """ res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) if res is False: return res xml_string, urlh = res @@ -719,15 +791,21 @@ class InfoExtractor(object): xml_string, video_id, transform_source=transform_source, fatal=fatal), urlh - def _download_xml(self, url_or_request, video_id, - note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, - data=None, headers={}, query={}): - """Return the xml as an xml.etree.ElementTree.Element""" + def _download_xml( + self, url_or_request, video_id, + note='Downloading XML', errnote='Unable to download XML', + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}, expected_status=None): + """ + Return the xml as an xml.etree.ElementTree.Element. + + See _download_webpage docstring for arguments specification. + """ res = self._download_xml_handle( url_or_request, video_id, note=note, errnote=errnote, transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query) + data=data, headers=headers, query=query, + expected_status=expected_status) return res if res is False else res[0] def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): @@ -745,11 +823,17 @@ class InfoExtractor(object): def _download_json_handle( self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - """Return a tuple (JSON object, URL handle)""" + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (JSON object, URL handle). + + See _download_webpage docstring for arguments specification. + """ res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) if res is False: return res json_string, urlh = res @@ -760,11 +844,18 @@ class InfoExtractor(object): def _download_json( self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return the JSON object as a dict. + + See _download_webpage docstring for arguments specification. + """ res = self._download_json_handle( url_or_request, video_id, note=note, errnote=errnote, transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query) + data=data, headers=headers, query=query, + expected_status=expected_status) return res if res is False else res[0] def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): From 00a429bea3c2deacef5dbfb2b0b7e191b1dbaf62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 04:04:13 +0700 Subject: [PATCH 270/488] [markiza] Expect 500 status code --- youtube_dl/extractor/markiza.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/markiza.py b/youtube_dl/extractor/markiza.py index e6bfab114..def960a0c 100644 --- a/youtube_dl/extractor/markiza.py +++ b/youtube_dl/extractor/markiza.py @@ -110,7 +110,11 @@ class MarkizaPageIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + webpage = self._download_webpage( + # Downloading for some hosts (e.g. dajto, doma) fails with 500 + # although everything seems to be OK, so considering 500 + # status code to be expected. + url, playlist_id, expected_status=500) entries = [ self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id) From 9283d4ea03f907f2b9e7954b0897075a165b4d4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 04:04:47 +0700 Subject: [PATCH 271/488] [bbccouk] Use expected_status --- youtube_dl/extractor/bbc.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 30a63a24e..293d82b0f 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -21,7 +21,6 @@ from ..utils import ( urljoin, ) from ..compat import ( - compat_etree_fromstring, compat_HTTPError, compat_urlparse, ) @@ -334,14 +333,9 @@ class BBCCoUkIE(InfoExtractor): self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - try: - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404): - media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) - else: - raise + media_selection = self._download_xml( + url, programme_id, 'Downloading media selection XML', + expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) def _process_media_selector(self, media_selection, programme_id): From 721a877d2fb82de18e4aeec27d70f84f9b41f766 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 23:08:35 +0700 Subject: [PATCH 272/488] [vgtv] Add support for www.aftonbladet.se/tv/ URLs --- youtube_dl/extractor/vgtv.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index c21a09c01..d430e2944 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -24,6 +24,7 @@ class VGTVIE(XstreamIE): 'aftenposten.no/webtv': 'aptv', 'ap.vgtv.no/webtv': 'aptv', 'tv.aftonbladet.se/abtv': 'abtv', + 'www.aftonbladet.se/tv': 'abtv', } _APP_NAME_TO_VENDOR = { @@ -44,7 +45,7 @@ class VGTVIE(XstreamIE): (?: (?:\#!/)?(?:video|live)/| embed?.*id=| - articles/ + a(?:rticles)?/ )| (?P<appname> %s @@ -143,6 +144,10 @@ class VGTVIE(XstreamIE): 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'only_matching': True, }, + { + 'url': 'https://www.aftonbladet.se/tv/a/36015', + 'only_matching': True, + }, { 'url': 'abtv:140026', 'only_matching': True, From 713afa705c228c2caa6054fff19a7690ba19d64a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 23:15:38 +0700 Subject: [PATCH 273/488] [vgtv] Improve HLS formats extraction --- youtube_dl/extractor/vgtv.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index d430e2944..fe7a26b62 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -183,13 +183,15 @@ class VGTVIE(XstreamIE): streams = data['streamUrls'] stream_type = data.get('streamType') - + is_live = stream_type == 'live' formats = [] hls_url = streams.get('hls') if hls_url: formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + hls_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) hds_url = streams.get('hds') if hds_url: @@ -234,13 +236,13 @@ class VGTVIE(XstreamIE): info.update({ 'id': video_id, - 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], + 'title': self._live_title(data['title']) if is_live else data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], 'duration': float_or_none(data['duration'], 1000), 'view_count': data['displays'], - 'is_live': True if stream_type == 'live' else False, + 'is_live': is_live, }) return info From 18806e3b6b95d03c773c89e465e1b28b2f12a618 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 18 Jun 2018 19:08:54 +0100 Subject: [PATCH 274/488] [rtbf] fix extraction for python 3.2 and older --- youtube_dl/extractor/rtbf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index acff9766a..3b0f3080b 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -99,7 +99,7 @@ class RTBFIE(InfoExtractor): http_url = data.get('url') if formats and http_url and re.search(height_re, http_url): http_url = fix_url(http_url) - for m3u8_f in formats.copy(): + for m3u8_f in formats[:]: height = m3u8_f.get('height') if not height: continue From e12b4b8bccd364cb5cc68aab4888209965a82dc1 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 19 Jun 2018 10:35:42 +0100 Subject: [PATCH 275/488] [6play] use geo verfication headers --- youtube_dl/extractor/sixplay.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 1f8469a90..a363221bc 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -71,7 +71,9 @@ class SixPlayIE(InfoExtractor): if container == 'm3u8' or ext == 'm3u8': if protocol == 'usp': if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: - urlh = self._request_webpage(asset_url, video_id, fatal=False) + urlh = self._request_webpage( + asset_url, video_id, fatal=False, + headers=self.geo_verification_headers()) if not urlh: continue asset_url = urlh.geturl() From 8b4b400aef83b233502ece7321ee84f6ab9e213e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 19 Jun 2018 23:00:36 +0700 Subject: [PATCH 276/488] [peertube] Improve generic support (closes #16733) --- youtube_dl/extractor/generic.py | 2 +- youtube_dl/extractor/peertube.py | 47 +++++++++++++++++++++++--------- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index dad951b75..6c0f772ac 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -3076,7 +3076,7 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) - peertube_urls = PeerTubeIE._extract_urls(webpage) + peertube_urls = PeerTubeIE._extract_urls(webpage, url) if peertube_urls: return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index a481b3151..d9849a2ba 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -116,12 +116,14 @@ class PeerTubeIE(InfoExtractor): videos\.tcit\.fr| peertube\.cpy\.re )''' + _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' _VALID_URL = r'''(?x) - https?:// - %s - /(?:videos/(?:watch|embed)|api/v\d/videos)/ - (?P<id>[^/?\#&]+) - ''' % _INSTANCES_RE + (?: + peertube:(?P<host>[^:]+):| + https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/ + ) + (?P<id>%s) + ''' % (_INSTANCES_RE, _UUID_RE) _TESTS = [{ 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', 'md5': '80f24ff364cc9d333529506a263e7feb', @@ -157,21 +159,40 @@ class PeerTubeIE(InfoExtractor): }, { 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', 'only_matching': True, + }, { + 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', + 'only_matching': True, }] @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'''(?x)<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s/videos/embed/[^/?\#&]+)\1''' - % PeerTubeIE._INSTANCES_RE, webpage)] + def _extract_peertube_url(webpage, source_url): + mobj = re.match( + r'https?://(?P<host>[^/]+)/videos/watch/(?P<id>%s)' + % PeerTubeIE._UUID_RE, source_url) + if mobj and any(p in webpage for p in ( + '<title>PeerTube<', + 'There will be other non JS-based clients to access PeerTube', + '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): + return 'peertube:%s:%s' % mobj.group('host', 'id') + + @staticmethod + def _extract_urls(webpage, source_url): + entries = re.findall( + r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' + % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) + if not entries: + peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) + if peertube_url: + entries = [peertube_url] + return entries def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or mobj.group('host_2') + video_id = mobj.group('id') video = self._download_json( - urljoin(url, '/api/v1/videos/%s' % video_id), video_id) + 'https://%s/api/v1/videos/%s' % (host, video_id), video_id) title = video['name'] From e73050882763705ccb8e487edbc3983b5582b1a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 19 Jun 2018 23:12:53 +0700 Subject: [PATCH 277/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index fe5087097..1494081b8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version <unreleased> + +Core ++ [extractor/common] Introduce expected_status in _download_* methods + for convenient accept of HTTP requests failed with non 2xx status codes ++ [compat] Introduce compat_integer_types + +Extractors +* [peertube] Improve generic support (#16733) ++ [6play] Use geo verification headers +* [rtbf] Fix extraction for python 3.2 +* [vgtv] Improve HLS formats extraction ++ [vgtv] Add support for www.aftonbladet.se/tv URLs +* [bbccouk] Use expected_status +* [markiza] Expect 500 HTTP status code +* [tvnow] Try all clear manifest URLs (#15361) + + version 2018.06.18 Core From c9b983ff827aae25a0fe2116c98c26702c581b81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 19 Jun 2018 23:16:04 +0700 Subject: [PATCH 278/488] release 2018.06.19 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index de3888214..d254678b5 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.19*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.19** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.18 +[debug] youtube-dl version 2018.06.19 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 1494081b8..93dc40d8c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.19 Core + [extractor/common] Introduce expected_status in _download_* methods diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 49fef60ea..dd4795cd1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.18' +__version__ = '2018.06.19' From f51f526b0acb5943332452d1958581cb1135bfe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 20 Jun 2018 23:51:14 +0700 Subject: [PATCH 279/488] [foxnews] Add support for iframe embeds (closes #15810, closes #16711) --- youtube_dl/extractor/foxnews.py | 42 +++++++++++++++++++++++++++------ youtube_dl/extractor/generic.py | 6 +++++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index dc0662f74..4c402806a 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -58,6 +58,14 @@ class FoxNewsIE(AMPIE): }, ] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1', + webpage)] + def _real_extract(self, url): host, video_id = re.match(self._VALID_URL, url).groups() @@ -71,18 +79,35 @@ class FoxNewsArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' IE_NAME = 'foxnews:article' - _TEST = { + _TESTS = [{ + # data-video-id 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '62aa5a781b308fdee212ebb6f33ae7ef', + 'md5': '83d44e1aff1433e7a29a7b537d1700b5', 'info_dict': { 'id': '5116295019001', 'ext': 'mp4', 'title': 'Trump and Clinton asked to defend positions on Iraq War', 'description': 'Veterans react on \'The Kelly File\'', - 'timestamp': 1473299755, + 'timestamp': 1473301045, 'upload_date': '20160908', }, - } + }, { + # iframe embed + 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'info_dict': { + 'id': '5748266721001', + 'ext': 'flv', + 'title': 'Kyle Kashuv has a positive message for the Trump White House', + 'description': 'Marjory Stoneman Douglas student disagrees with classmates.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 229, + 'timestamp': 1520594670, + 'upload_date': '20180309', + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -90,10 +115,13 @@ class FoxNewsArticleIE(InfoExtractor): video_id = self._html_search_regex( r'data-video-id=([\'"])(?P<id>[^\'"]+)\1', - webpage, 'video ID', group='id') + webpage, 'video ID', group='id', default=None) + if video_id: + return self.url_result( + 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) + return self.url_result( - 'http://video.foxnews.com/v/' + video_id, - FoxNewsIE.ie_key()) + FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) class FoxNewsInsiderIE(InfoExtractor): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6c0f772ac..d71cb9050 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -111,6 +111,7 @@ from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE from .indavideo import IndavideoEmbedIE from .apa import APAIE +from .foxnews import FoxNewsIE class GenericIE(InfoExtractor): @@ -3091,6 +3092,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( apa_urls, video_id, video_title, ie=APAIE.ie_key()) + foxnews_urls = FoxNewsIE._extract_urls(webpage) + if foxnews_urls: + return self.playlist_from_matches( + foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] From 91aa502d916fd3f103d34f927748767413f1d1a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 20 Jun 2018 23:59:37 +0700 Subject: [PATCH 280/488] [foxnews:insider] Remove extractor (#15810) Now covered by foxnews:article --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/foxnews.py | 49 +++--------------------------- 2 files changed, 4 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3b3964c01..27ece3b53 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -373,7 +373,6 @@ from .foxgay import FoxgayIE from .foxnews import ( FoxNewsIE, FoxNewsArticleIE, - FoxNewsInsiderIE, ) from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 4c402806a..63613cb85 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -76,7 +76,7 @@ class FoxNewsIE(AMPIE): class FoxNewsArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' IE_NAME = 'foxnews:article' _TESTS = [{ @@ -107,6 +107,9 @@ class FoxNewsArticleIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', + 'only_matching': True, }] def _real_extract(self, url): @@ -122,47 +125,3 @@ class FoxNewsArticleIE(InfoExtractor): return self.url_result( FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) - - -class FoxNewsInsiderIE(InfoExtractor): - _VALID_URL = r'https?://insider\.foxnews\.com/([^/]+/)+(?P<id>[a-z-]+)' - IE_NAME = 'foxnews:insider' - - _TEST = { - 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', - 'md5': 'a10c755e582d28120c62749b4feb4c0c', - 'info_dict': { - 'id': '5099377331001', - 'display_id': 'univ-wisconsin-student-group-pushing-silence-certain-words', - 'ext': 'mp4', - 'title': 'Student Group: Saying \'Politically Correct,\' \'Trash\' and \'Lame\' Is Offensive', - 'description': 'Is campus censorship getting out of control?', - 'timestamp': 1472168725, - 'upload_date': '20160825', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': [FoxNewsIE.ie_key()], - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - embed_url = self._html_search_meta('embedUrl', webpage, 'embed URL') - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - - return { - '_type': 'url_transparent', - 'ie_key': FoxNewsIE.ie_key(), - 'url': embed_url, - 'display_id': display_id, - 'title': title, - 'description': description, - } From 30374f4d40d8c993bf92c5af9b9c073da49fe8b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Jun 2018 23:06:58 +0700 Subject: [PATCH 281/488] [itv] Make SOAP request non fatal and extract metadata from a webpage (closes #16780) --- youtube_dl/extractor/itv.py | 126 ++++++++++++++++++++---------------- 1 file changed, 69 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 6a4f8a505..40cffed46 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -18,6 +18,7 @@ from ..utils import ( xpath_element, xpath_text, int_or_none, + merge_dicts, parse_duration, smuggle_url, ExtractorError, @@ -129,64 +130,65 @@ class ITVIE(InfoExtractor): resp_env = self._download_xml( params['data-playlist-url'], video_id, - headers=headers, data=etree.tostring(req_env)) - playlist = xpath_element(resp_env, './/Playlist') - if playlist is None: - fault_code = xpath_text(resp_env, './/faultcode') - fault_string = xpath_text(resp_env, './/faultstring') - if fault_code == 'InvalidGeoRegion': - self.raise_geo_restricted( - msg=fault_string, countries=self._GEO_COUNTRIES) - elif fault_code not in ( - 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, fault_string), expected=True) - info.update({ - 'title': self._og_search_title(webpage), - 'episode_title': params.get('data-video-episode'), - 'series': params.get('data-video-title'), - }) - else: - title = xpath_text(playlist, 'EpisodeTitle', default=None) - info.update({ - 'title': title, - 'episode_title': title, - 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), - 'series': xpath_text(playlist, 'ProgrammeTitle'), - 'duration': parse_duration(xpath_text(playlist, 'Duration')), - }) - video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) - media_files = xpath_element(video_element, 'MediaFiles', fatal=True) - rtmp_url = media_files.attrib['base'] + headers=headers, data=etree.tostring(req_env), fatal=False) + if resp_env: + playlist = xpath_element(resp_env, './/Playlist') + if playlist is None: + fault_code = xpath_text(resp_env, './/faultcode') + fault_string = xpath_text(resp_env, './/faultstring') + if fault_code == 'InvalidGeoRegion': + self.raise_geo_restricted( + msg=fault_string, countries=self._GEO_COUNTRIES) + elif fault_code not in ( + 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, fault_string), expected=True) + info.update({ + 'title': self._og_search_title(webpage), + 'episode_title': params.get('data-video-episode'), + 'series': params.get('data-video-title'), + }) + else: + title = xpath_text(playlist, 'EpisodeTitle', default=None) + info.update({ + 'title': title, + 'episode_title': title, + 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), + 'series': xpath_text(playlist, 'ProgrammeTitle'), + 'duration': parse_duration(xpath_text(playlist, 'Duration')), + }) + video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) + media_files = xpath_element(video_element, 'MediaFiles', fatal=True) + rtmp_url = media_files.attrib['base'] - for media_file in media_files.findall('MediaFile'): - play_path = xpath_text(media_file, 'URL') - if not play_path: - continue - tbr = int_or_none(media_file.get('bitrate'), 1000) - f = { - 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'play_path': play_path, - # Providing this swfVfy allows to avoid truncated downloads - 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', - 'page_url': url, - 'tbr': tbr, - 'ext': 'flv', - } - app = self._search_regex( - 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) - if app: - f.update({ - 'url': rtmp_url.split('?', 1)[0], - 'app': app, - }) - else: - f['url'] = rtmp_url - formats.append(f) + for media_file in media_files.findall('MediaFile'): + play_path = xpath_text(media_file, 'URL') + if not play_path: + continue + tbr = int_or_none(media_file.get('bitrate'), 1000) + f = { + 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), + 'play_path': play_path, + # Providing this swfVfy allows to avoid truncated downloads + 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', + 'page_url': url, + 'tbr': tbr, + 'ext': 'flv', + } + app = self._search_regex( + 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) + if app: + f.update({ + 'url': rtmp_url.split('?', 1)[0], + 'app': app, + }) + else: + f['url'] = rtmp_url + formats.append(f) - for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): - if caption_url.text: - extract_subtitle(caption_url.text) + for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): + if caption_url.text: + extract_subtitle(caption_url.text) ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') hmac = params.get('data-video-hmac') @@ -261,7 +263,17 @@ class ITVIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, }) - return info + + webpage_info = self._search_json_ld(webpage, video_id, default={}) + if not webpage_info.get('title'): + webpage_info['title'] = self._html_search_regex( + r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<', + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or webpage_info['episode'] + + return merge_dicts(info, webpage_info) class ITVBTCCIE(InfoExtractor): From a4ec45179e554e9b24e32c3c06908804b42a5a9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Jun 2018 23:12:40 +0700 Subject: [PATCH 282/488] [itv] Sort imports --- youtube_dl/extractor/itv.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 40cffed46..d05a7b68d 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -13,16 +13,16 @@ from ..compat import ( compat_etree_register_namespace, ) from ..utils import ( + determine_ext, + ExtractorError, extract_attributes, - xpath_with_ns, - xpath_element, - xpath_text, int_or_none, merge_dicts, parse_duration, smuggle_url, - ExtractorError, - determine_ext, + xpath_with_ns, + xpath_element, + xpath_text, ) From b71cc719103c45365244334a4c481f88cd3534fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Jun 2018 23:38:32 +0700 Subject: [PATCH 283/488] [motherless] Fix extraction (closes #16786) --- youtube_dl/extractor/motherless.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index e24396e79..f191310e1 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -77,8 +77,11 @@ class MotherlessIE(InfoExtractor): title = self._html_search_regex( r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') - video_url = self._html_search_regex( - r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL') + video_url = (self._html_search_regex( + (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'), + webpage, 'video URL', default=None, group='url') or + 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( r'<strong>Views</strong>\s+([^<]+)<', From 9fb62e35f6e7d865a73cc310f24ccfa0700e5e26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Jun 2018 23:39:13 +0700 Subject: [PATCH 284/488] [motherless:group] Fix _VALID_URL --- youtube_dl/extractor/motherless.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index f191310e1..bed5645f2 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -123,7 +123,7 @@ class MotherlessIE(InfoExtractor): class MotherlessGroupIE(InfoExtractor): - _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' + _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' _TESTS = [{ 'url': 'http://motherless.com/g/movie_scenes', 'info_dict': { From 74caf528bc822738dffe231df86ed399fc97a38a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 24 Jun 2018 12:02:16 +0100 Subject: [PATCH 285/488] [brightcove] workaround sonyliv DRM protected videos(closes #16807) --- youtube_dl/extractor/brightcove.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ab62e54d6..14f9a14ed 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -572,7 +572,8 @@ class BrightcoveNewIE(AdobePassIE): container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - if ext == 'ism' or container == 'WVM': + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if ext == 'ism' or container == 'WVM' or source.get('key_systems'): continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -629,6 +630,14 @@ class BrightcoveNewIE(AdobePassIE): 'format_id': build_format_id('rtmp'), }) formats.append(f) + if not formats: + # for sonyliv.com DRM protected videos + s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') + if s3_source_url: + formats.append({ + 'url': s3_source_url, + 'format_id': 'source', + }) errors = json_data.get('errors') if not formats and errors: From a0949fec081d0badd6a584526cd66e8f170625c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jun 2018 23:57:22 +0700 Subject: [PATCH 286/488] [joj] Relax _VALID_URL (closes #16771) --- youtube_dl/extractor/joj.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py index a764023e9..d9f8dbfd2 100644 --- a/youtube_dl/extractor/joj.py +++ b/youtube_dl/extractor/joj.py @@ -18,7 +18,7 @@ class JojIE(InfoExtractor): joj:| https?://media\.joj\.sk/embed/ ) - (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + (?P<id>[^/?#^]+) ''' _TESTS = [{ 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', @@ -29,16 +29,24 @@ class JojIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 3118, } + }, { + 'url': 'https://media.joj.sk/embed/9i1cxv', + 'only_matching': True, }, { 'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932', 'only_matching': True, + }, { + 'url': 'joj:9i1cxv', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): - return re.findall( - r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - webpage) + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1', + webpage)] def _real_extract(self, url): video_id = self._match_id(url) From c306f076ec81334b458f61b2a4ae683a9e732d06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 25 Jun 2018 02:17:14 +0700 Subject: [PATCH 287/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ChangeLog b/ChangeLog index 93dc40d8c..327580328 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +version <unreleased> + +Extractors +* [joj] Relax URL regular expression (#16771) +* [brightcove] Workaround sonyliv DRM protected videos (#16807) +* [motherless] Fix extraction (#16786) +* [itv] Make SOAP request non fatal and extract metadata from webpage (#16780) +- [foxnews:insider] Remove extractor (#15810) ++ [foxnews] Add support for iframe embeds (#15810, #16711) + + version 2018.06.19 Core From 1f6cc5807ec69584664388b8edfaf6b3ae442cea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 25 Jun 2018 02:26:02 +0700 Subject: [PATCH 288/488] release 2018.06.25 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 - youtube_dl/version.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index d254678b5..128e6e681 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.19*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.19** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.25*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.25** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.19 +[debug] youtube-dl version 2018.06.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 327580328..8eb7469d4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.25 Extractors * [joj] Relax URL regular expression (#16771) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 432a7ba93..a78fabb02 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -290,7 +290,6 @@ - **Foxgay** - **foxnews**: Fox News and Fox Business Video - **foxnews:article** - - **foxnews:insider** - **FoxSports** - **france2.fr:generation-what** - **FranceCulture** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index dd4795cd1..8fbafd6a1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.19' +__version__ = '2018.06.25' From c3bcd206eb031de30179c88ac7acd806a477ceae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Jun 2018 00:01:06 +0700 Subject: [PATCH 289/488] [porncom] Fix extraction (closes #16808) --- youtube_dl/extractor/porncom.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py index 60ade06da..5726cab3a 100644 --- a/youtube_dl/extractor/porncom.py +++ b/youtube_dl/extractor/porncom.py @@ -43,7 +43,8 @@ class PornComIE(InfoExtractor): config = self._parse_json( self._search_regex( - r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=', + (r'=\s*({.+?})\s*;\s*v1ar\b', + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), webpage, 'config', default='{}'), display_id, transform_source=js_to_json, fatal=False) @@ -69,7 +70,7 @@ class PornComIE(InfoExtractor): 'height': int(height), 'filesize_approx': parse_filesize(filesize), } for format_url, height, filesize in re.findall( - r'<a[^>]+href="(/download/[^"]+)">MPEG4 (\d+)p<span[^>]*>(\d+\s+[a-zA-Z]+)<', + r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<', webpage)] thumbnail = None duration = None From 7b393f9cc5dc4790bcb623c768fa4a3046ef80bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Jun 2018 04:29:11 +0700 Subject: [PATCH 290/488] [svt] Improve extraction and add support for pages (closes #16802) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 11 --- youtube_dl/extractor/svt.py | 117 ++++++++++++++++++++++++----- 3 files changed, 98 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 27ece3b53..f2377521b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1040,6 +1040,7 @@ from .stretchinternet import StretchInternetIE from .sunporno import SunPornoIE from .svt import ( SVTIE, + SVTPageIE, SVTPlayIE, SVTSeriesIE, ) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d71cb9050..aa04905ed 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1395,17 +1395,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - # SVT embed - { - 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', - 'info_dict': { - 'id': '2900353', - 'ext': 'flv', - 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)', - 'duration': 27, - 'age_limit': 0, - }, - }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index f71eab8b2..0901c3163 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -12,6 +12,8 @@ from ..utils import ( determine_ext, dict_get, int_or_none, + orderedSet, + strip_or_none, try_get, urljoin, compat_str, @@ -137,7 +139,12 @@ class SVTPlayBaseIE(SVTBaseIE): class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)' + _VALID_URL = r'''(?x) + (?: + svt:(?P<svt_id>[^/?#&]+)| + https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) + ) + ''' _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -164,10 +171,40 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'https://www.svtplay.se/kanaler/svt1', 'only_matching': True, + }, { + 'url': 'svt:1376446-003A', + 'only_matching': True, + }, { + 'url': 'svt:14278044', + 'only_matching': True, }] + def _adjust_title(self, info): + if info['is_live']: + info['title'] = self._live_title(info['title']) + + def _extract_by_video_id(self, video_id, webpage=None): + data = self._download_json( + 'https://api.svt.se/videoplayer-api/video/%s' % video_id, + video_id, headers=self.geo_verification_headers()) + info_dict = self._extract_video(data, video_id) + if not info_dict.get('title'): + title = dict_get(info_dict, ('episode', 'series')) + if not title and webpage: + title = re.sub( + r'\s*\|\s*.+?$', '', self._og_search_title(webpage)) + if not title: + title = video_id + info_dict['title'] = title + self._adjust_title(info_dict) + return info_dict + def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id, svt_id = mobj.group('id', 'svt_id') + + if svt_id: + return self._extract_by_video_id(svt_id) webpage = self._download_webpage(url, video_id) @@ -179,10 +216,6 @@ class SVTPlayIE(SVTPlayBaseIE): thumbnail = self._og_search_thumbnail(webpage) - def adjust_title(info): - if info['is_live']: - info['title'] = self._live_title(info['title']) - if data: video_info = try_get( data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], @@ -193,24 +226,14 @@ class SVTPlayIE(SVTPlayBaseIE): 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], 'thumbnail': thumbnail, }) - adjust_title(info_dict) + self._adjust_title(info_dict) return info_dict - video_id = self._search_regex( + svt_id = self._search_regex( r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', - webpage, 'video id', default=None) + webpage, 'video id') - if video_id: - data = self._download_json( - 'https://api.svt.se/videoplayer-api/video/%s' % video_id, - video_id, headers=self.geo_verification_headers()) - info_dict = self._extract_video(data, video_id) - if not info_dict.get('title'): - info_dict['title'] = re.sub( - r'\s*\|\s*.+?$', '', - info_dict.get('episode') or self._og_search_title(webpage)) - adjust_title(info_dict) - return info_dict + return self._extract_by_video_id(svt_id, webpage) class SVTSeriesIE(SVTPlayBaseIE): @@ -292,3 +315,57 @@ class SVTSeriesIE(SVTPlayBaseIE): return self.playlist_result( entries, series_id, title, metadata.get('description')) + + +class SVTPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'info_dict': { + 'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'title': 'GUIDE: Sommarträning du kan göra var och när du vill', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'info_dict': { + 'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”', + }, + 'playlist_count': 1, + }, { + # only programTitle + 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', + 'info_dict': { + 'id': '2900353', + 'ext': 'mp4', + 'title': 'Stjärnorna skojar till det - under SVT-intervjun', + 'duration': 27, + 'age_limit': 0, + }, + }, { + 'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1', + 'only_matching': True, + }, { + 'url': 'https://www.svt.se/vader/manadskronikor/maj2018', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result( + 'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'data-video-id=["\'](\d+)', webpage))] + + title = strip_or_none(self._og_search_title(webpage, default=None)) + + return self.playlist_result(entries, playlist_id, title) From acbd0ff5df5ff9d69e6707ea4fa3e3b4f9cc6528 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Jun 2018 00:35:05 +0700 Subject: [PATCH 291/488] [dctptv] Restore extraction based on REST API (closes #16850) --- youtube_dl/extractor/dctp.py | 82 ++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 3a6d0560e..dc0c41b8a 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -5,13 +5,15 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( float_or_none, - unified_strdate, + int_or_none, + unified_timestamp, ) class DctpTvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ + # 4x3 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { 'id': '95eaa4f33dad413aa17b4ee613cccc6c', @@ -19,31 +21,49 @@ class DctpTvIE(InfoExtractor): 'ext': 'flv', 'title': 'Videoinstallation für eine Kaufhausfassade', 'description': 'Kurzfilm', - 'upload_date': '20110407', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 71.24, + 'timestamp': 1302172322, + 'upload_date': '20110407', }, 'params': { # rtmp download 'skip_download': True, }, - } + }, { + # 16x9 + 'url': 'http://www.dctp.tv/filme/sind-youtuber-die-besseren-lehrer/', + 'only_matching': True, + }] + + _BASE_URL = 'http://dctp-ivms2-restapi.s3.amazonaws.com' def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + version = self._download_json( + '%s/version.json' % self._BASE_URL, display_id, + 'Downloading version JSON') - video_id = self._html_search_meta( - 'DC.identifier', webpage, 'video id', - default=None) or self._search_regex( - r'id=["\']uuid[^>]+>([^<]+)<', webpage, 'video id') + restapi_base = '%s/%s/restapi' % ( + self._BASE_URL, version['version_name']) - title = self._og_search_title(webpage) + info = self._download_json( + '%s/slugs/%s.json' % (restapi_base, display_id), display_id, + 'Downloading video info JSON') + + media = self._download_json( + '%s/media/%s.json' % (restapi_base, compat_str(info['object_id'])), + display_id, 'Downloading media JSON') + + uuid = media['uuid'] + title = media['title'] + ratio = '16x9' if media.get('is_wide') else '4x3' + play_path = 'mp4:%s_dctp_0500_%s.m4v' % (uuid, ratio) servers = self._download_json( 'http://www.dctp.tv/streaming_servers/', display_id, - note='Downloading server list', fatal=False) + note='Downloading server list JSON', fatal=False) if servers: endpoint = next( @@ -60,27 +80,35 @@ class DctpTvIE(InfoExtractor): formats = [{ 'url': endpoint, 'app': app, - 'play_path': 'mp4:%s_dctp_0500_4x3.m4v' % video_id, + 'play_path': play_path, 'page_url': url, - 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-109.swf', + 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-110.swf', 'ext': 'flv', }] - description = self._html_search_meta('DC.description', webpage) - upload_date = unified_strdate( - self._html_search_meta('DC.date.created', webpage)) - thumbnail = self._og_search_thumbnail(webpage) - duration = float_or_none(self._search_regex( - r'id=["\']duration_in_ms[^+]>(\d+)', webpage, 'duration', - default=None), scale=1000) + thumbnails = [] + images = media.get('images') + if isinstance(images, list): + for image in images: + if not isinstance(image, dict): + continue + image_url = image.get('url') + if not image_url or not isinstance(image_url, compat_str): + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) return { - 'id': video_id, - 'title': title, - 'formats': formats, + 'id': uuid, 'display_id': display_id, - 'description': description, - 'upload_date': upload_date, - 'thumbnail': thumbnail, - 'duration': duration, + 'title': title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description') or media.get('teaser'), + 'timestamp': unified_timestamp(media.get('created')), + 'duration': float_or_none(media.get('duration_in_ms'), scale=1000), + 'thumbnails': thumbnails, + 'formats': formats, } From d4a24f4091a622b808ff621e78b5cfd0db3c8c11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Jun 2018 01:09:14 +0700 Subject: [PATCH 292/488] Prefer ffmpeg over avconv by default (closes #8622) --- youtube_dl/YoutubeDL.py | 4 ++-- youtube_dl/options.py | 4 ++-- youtube_dl/postprocessor/ffmpeg.py | 16 ++++++++-------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2a405c5ca..38ba43a97 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -305,8 +305,8 @@ class YoutubeDL(object): http_chunk_size. The following options are used by the post processors: - prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, - otherwise prefer avconv. + prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, + otherwise prefer ffmpeg. postprocessor_args: A list of additional command-line arguments for the postprocessor. diff --git a/youtube_dl/options.py b/youtube_dl/options.py index e83d546a0..e7d8e8910 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -841,11 +841,11 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--prefer-avconv', action='store_false', dest='prefer_ffmpeg', - help='Prefer avconv over ffmpeg for running the postprocessors (default)') + help='Prefer avconv over ffmpeg for running the postprocessors') postproc.add_option( '--prefer-ffmpeg', action='store_true', dest='prefer_ffmpeg', - help='Prefer ffmpeg over avconv for running the postprocessors') + help='Prefer ffmpeg over avconv for running the postprocessors (default)') postproc.add_option( '--ffmpeg-location', '--avconv-location', metavar='PATH', dest='ffmpeg_location', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 3ea1afcf3..757b496a1 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -77,7 +77,7 @@ class FFmpegPostProcessor(PostProcessor): def _determine_executables(self): programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - prefer_ffmpeg = False + prefer_ffmpeg = True self.basename = None self.probe_basename = None @@ -85,7 +85,7 @@ class FFmpegPostProcessor(PostProcessor): self._paths = None self._versions = None if self._downloader: - prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', False) + prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', True) location = self._downloader.params.get('ffmpeg_location') if location is not None: if not os.path.exists(location): @@ -117,19 +117,19 @@ class FFmpegPostProcessor(PostProcessor): (p, get_exe_version(p, args=['-version'])) for p in programs) self._paths = dict((p, p) for p in programs) - if prefer_ffmpeg: - prefs = ('ffmpeg', 'avconv') - else: + if prefer_ffmpeg is False: prefs = ('avconv', 'ffmpeg') + else: + prefs = ('ffmpeg', 'avconv') for p in prefs: if self._versions[p]: self.basename = p break - if prefer_ffmpeg: - prefs = ('ffprobe', 'avprobe') - else: + if prefer_ffmpeg is False: prefs = ('avprobe', 'ffprobe') + else: + prefs = ('ffprobe', 'avprobe') for p in prefs: if self._versions[p]: self.probe_basename = p From 5e8e2fa51f416e227367211ab937dfea17f89f57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Jun 2018 01:25:05 +0700 Subject: [PATCH 293/488] [extractor/common] Use source URL as Referer for HTML5 entries (closes #16849) --- youtube_dl/extractor/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 394f34372..f3fec160d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2437,6 +2437,8 @@ class InfoExtractor(object): media_info['subtitles'].setdefault(lang, []).append({ 'url': absolute_url(src), }) + for f in media_info['formats']: + f.setdefault('http_headers', {})['Referer'] = base_url if media_info['formats'] or media_info['subtitles']: entries.append(media_info) return entries From 9cf648c92bcc131db7d7fad673864bba06121482 Mon Sep 17 00:00:00 2001 From: Timendum <timedum@gmail.com> Date: Mon, 18 Jun 2018 11:50:06 +0200 Subject: [PATCH 294/488] [mediaset] Add support for new videos --- youtube_dl/extractor/mediaset.py | 60 ++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 9760eafd5..76a2ae125 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -10,6 +10,7 @@ from ..utils import ( parse_duration, try_get, unified_strdate, + ExtractorError ) @@ -42,6 +43,22 @@ class MediasetIE(InfoExtractor): 'categories': ['reality'], }, 'expected_warnings': ['is not a supported codec'], + }, { + 'url': 'http://www.video.mediaset.it/video/matrix/full_chiambretti/puntata-del-25-maggio_846685.html', + 'md5': '1276f966ac423d16ba255ce867de073e', + 'info_dict': { + 'id': '846685', + 'ext': 'mp4', + 'title': 'Puntata del 25 maggio', + 'description': 'md5:ee2e456e3eb1dba5e814596655bb5296', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 6565, + 'creator': 'mediaset', + 'upload_date': '20180525', + 'series': 'Matrix', + 'categories': ['infotainment'], + }, + 'expected_warnings': ['is not a supported codec'], }, { # clip 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', @@ -70,18 +87,29 @@ class MediasetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + media_info = self._download_json( + 'https://www.video.mediaset.it/html/metainfo.sjson', + video_id, 'Downloading media info', query={ + 'id': video_id + })['video'] + + media_id = try_get(media_info, lambda x: x['guid']) or video_id + video_list = self._download_json( - 'http://cdnsel01.mediaset.net/GetCdn.aspx', + 'http://cdnsel01.mediaset.net/GetCdn2018.aspx', video_id, 'Downloading video CDN JSON', query={ - 'streamid': video_id, + 'streamid': media_id, 'format': 'json', })['videoList'] formats = [] for format_url in video_list: if '.ism' in format_url: - formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) + try: + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + except ExtractorError: + pass else: formats.append({ 'url': format_url, @@ -89,30 +117,24 @@ class MediasetIE(InfoExtractor): }) self._sort_formats(formats) - mediainfo = self._download_json( - 'http://plr.video.mediaset.it/html/metainfo.sjson', - video_id, 'Downloading video info JSON', query={ - 'id': video_id, - })['video'] - - title = mediainfo['title'] + title = media_info['title'] creator = try_get( - mediainfo, lambda x: x['brand-info']['publisher'], compat_str) + media_info, lambda x: x['brand-info']['publisher'], compat_str) category = try_get( - mediainfo, lambda x: x['brand-info']['category'], compat_str) + media_info, lambda x: x['brand-info']['category'], compat_str) categories = [category] if category else None return { 'id': video_id, 'title': title, - 'description': mediainfo.get('short-description'), - 'thumbnail': mediainfo.get('thumbnail'), - 'duration': parse_duration(mediainfo.get('duration')), + 'description': media_info.get('short-description'), + 'thumbnail': media_info.get('thumbnail'), + 'duration': parse_duration(media_info.get('duration')), 'creator': creator, - 'upload_date': unified_strdate(mediainfo.get('production-date')), - 'webpage_url': mediainfo.get('url'), - 'series': mediainfo.get('brand-value'), + 'upload_date': unified_strdate(media_info.get('production-date')), + 'webpage_url': media_info.get('url'), + 'series': media_info.get('brand-value'), 'categories': categories, 'formats': formats, } From 267d81962a0709f15f82f96b7aadbb5473a06992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 Jun 2018 02:16:44 +0700 Subject: [PATCH 295/488] [mediaset] Fix issues and extract all formats (closes #16568) --- youtube_dl/extractor/mediaset.py | 44 +++++++++++++++++--------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 76a2ae125..9f2b60dcc 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -10,7 +10,6 @@ from ..utils import ( parse_duration, try_get, unified_strdate, - ExtractorError ) @@ -58,7 +57,7 @@ class MediasetIE(InfoExtractor): 'series': 'Matrix', 'categories': ['infotainment'], }, - 'expected_warnings': ['is not a supported codec'], + 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { # clip 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', @@ -87,13 +86,14 @@ class MediasetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - media_info = self._download_json( + video = self._download_json( 'https://www.video.mediaset.it/html/metainfo.sjson', video_id, 'Downloading media info', query={ 'id': video_id })['video'] - media_id = try_get(media_info, lambda x: x['guid']) or video_id + title = video['title'] + media_id = video.get('guid') or video_id video_list = self._download_json( 'http://cdnsel01.mediaset.net/GetCdn2018.aspx', @@ -104,12 +104,17 @@ class MediasetIE(InfoExtractor): formats = [] for format_url in video_list: - if '.ism' in format_url: - try: - formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) - except ExtractorError: - pass + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'ism' or '.ism' in format_url: + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) else: formats.append({ 'url': format_url, @@ -117,24 +122,23 @@ class MediasetIE(InfoExtractor): }) self._sort_formats(formats) - title = media_info['title'] - creator = try_get( - media_info, lambda x: x['brand-info']['publisher'], compat_str) + video, lambda x: x['brand-info']['publisher'], compat_str) category = try_get( - media_info, lambda x: x['brand-info']['category'], compat_str) + video, lambda x: x['brand-info']['category'], compat_str) categories = [category] if category else None return { 'id': video_id, 'title': title, - 'description': media_info.get('short-description'), - 'thumbnail': media_info.get('thumbnail'), - 'duration': parse_duration(media_info.get('duration')), + 'description': video.get('short-description'), + 'thumbnail': video.get('thumbnail'), + 'duration': parse_duration(video.get('duration')), 'creator': creator, - 'upload_date': unified_strdate(media_info.get('production-date')), - 'webpage_url': media_info.get('url'), - 'series': media_info.get('brand-value'), + 'upload_date': unified_strdate(video.get('production-date')), + 'webpage_url': video.get('url'), + 'series': video.get('brand-value'), + 'season': video.get('season'), 'categories': categories, 'formats': formats, } From 2160768a215849e82a167912cb8f0aa054e87d8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 Jun 2018 23:39:56 +0700 Subject: [PATCH 296/488] [npo] Fix typo (closes #16872) --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index cb8319f0d..c2cb85a73 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -282,7 +282,7 @@ class NPOIE(NPOBaseIE): video_url = stream_info.get('url') if not video_url or video_url in urls: continue - urls.add(item_url) + urls.add(video_url) if determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, ext='mp4', From eca1f0d115e6a2712ff0d5f6b25e3ded5e52db71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Jul 2018 02:00:16 +0700 Subject: [PATCH 297/488] [extractor/common] Properly escape % in MPD templates (closes #16867) --- youtube_dl/extractor/common.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f3fec160d..78f053f18 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2106,7 +2106,21 @@ class InfoExtractor(object): representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) def prepare_template(template_name, identifiers): - t = representation_ms_info[template_name] + tmpl = representation_ms_info[template_name] + # First of, % characters outside $...$ templates + # must be escaped by doubling for proper processing + # by % operator string formatting used further (see + # https://github.com/rg3/youtube-dl/issues/16867). + t = '' + in_template = False + for c in tmpl: + t += c + if c == '$': + in_template = not in_template + elif c == '%' and not in_template: + t += c + # Next, $...$ templates are translated to their + # %(...) counterparts to be used with % operator t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) From 973b6ceebbf0c79086cbf3203a8a8c79daf0b1ba Mon Sep 17 00:00:00 2001 From: coreynicholson <coreynicholson@users.noreply.github.com> Date: Sun, 1 Jul 2018 15:19:17 +0100 Subject: [PATCH 298/488] [vlive] Fix live streams extraction --- youtube_dl/extractor/vlive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 64d0224e6..0b5165fd0 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -57,7 +57,7 @@ class VLiveIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s' % video_id, video_id) + 'https://www.vlive.tv/video/%s' % video_id, video_id) VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' VIDEO_PARAMS_FIELD = 'video params' @@ -108,11 +108,11 @@ class VLiveIE(InfoExtractor): def _live(self, video_id, webpage): init_page = self._download_webpage( - 'http://www.vlive.tv/video/init/view', + 'https://www.vlive.tv/video/init/view', video_id, note='Downloading live webpage', data=urlencode_postdata({'videoSeq': video_id}), headers={ - 'Referer': 'http://www.vlive.tv/video/%s' % video_id, + 'Referer': 'https://www.vlive.tv/video/%s' % video_id, 'Content-Type': 'application/x-www-form-urlencoded' }) From 8cee692b8b66322e4c1a0d37baceb9e4c49a3f8e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 1 Jul 2018 22:32:59 +0100 Subject: [PATCH 299/488] [go90] detect geo restriction error and pass geo verification headers(closes #16874) --- youtube_dl/extractor/go90.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 35dde42d0..6f8c56a93 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -28,14 +29,27 @@ class Go90IE(InfoExtractor): 'age_limit': 14, } } + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'https://www.go90.com/api/view/items/' + video_id, - video_id, headers={ + + try: + headers = self.geo_verification_headers() + headers.update({ 'Content-Type': 'application/json; charset=utf-8', - }, data=b'{"client":"web","device_type":"pc"}') + }) + video_data = self._download_json( + 'https://www.go90.com/api/view/items/' + video_id, video_id, + headers=headers, data=b'{"client":"web","device_type":"pc"}') + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + message = self._parse_json(e.cause.read().decode(), None)['error']['message'] + if 'region unavailable' in message: + self.raise_geo_restricted(countries=['US']) + raise ExtractorError(message, expected=True) + raise + if video_data.get('requires_drm'): raise ExtractorError('This video is DRM protected.', expected=True) main_video_asset = video_data['main_video_asset'] From db5debf313bd2ab99016f2c5b389dbf9ffae3dfb Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 1 Jul 2018 22:41:11 +0100 Subject: [PATCH 300/488] [go90] add support for embed urls(closes #16873) --- youtube_dl/extractor/go90.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 6f8c56a93..c3ea717bc 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -15,8 +15,8 @@ from ..utils import ( class Go90IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?go90\.com/videos/(?P<id>[0-9a-zA-Z]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?go90\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)' + _TESTS = [{ 'url': 'https://www.go90.com/videos/84BUqjLpf9D', 'md5': 'efa7670dbbbf21a7b07b360652b24a32', 'info_dict': { @@ -28,7 +28,10 @@ class Go90IE(InfoExtractor): 'upload_date': '20170411', 'age_limit': 14, } - } + }, { + 'url': 'https://www.go90.com/embed/261MflWkD3N', + 'only_matching': True, + }] _GEO_BYPASS = False def _real_extract(self, url): From 5621c3222eaab29d5ca705c8dac2d0bc1eb785d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Jul 2018 02:47:09 +0700 Subject: [PATCH 301/488] [lynda] Simplify login and improve error capturing (#16891) --- youtube_dl/extractor/lynda.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index f5c7abc13..1316cddb6 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -44,21 +44,15 @@ class LyndaBaseIE(InfoExtractor): form_data = self._hidden_inputs(form_html) form_data.update(extra_form_data) - try: - response = self._download_json( - action_url, None, note, - data=urlencode_postdata(form_data), - headers={ - 'Referer': referrer_url, - 'X-Requested-With': 'XMLHttpRequest', - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: - response = self._parse_json(e.cause.read().decode('utf-8'), None) - self._check_error(response, ('email', 'password')) - raise + response = self._download_json( + action_url, None, note, + data=urlencode_postdata(form_data), + headers={ + 'Referer': referrer_url, + 'X-Requested-With': 'XMLHttpRequest', + }, expected_status=(418, 500, )) - self._check_error(response, 'ErrorMessage') + self._check_error(response, ('email', 'password', 'ErrorMessage')) return response, action_url From 836ef4840f00d7e07c826de23f3a09675dc532d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Jul 2018 04:48:40 +0700 Subject: [PATCH 302/488] [pluralsight] Switch to graphql (closes #16889, closes #16899) --- youtube_dl/extractor/pluralsight.py | 120 +++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index a207ca9cb..1257841e4 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -27,6 +27,60 @@ from ..utils import ( class PluralsightBaseIE(InfoExtractor): _API_BASE = 'https://app.pluralsight.com' + _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE + _GRAPHQL_HEADERS = { + 'Content-Type': 'application/json;charset=UTF-8', + } + _GRAPHQL_COURSE_TMPL = ''' +query BootstrapPlayer { + rpc { + bootstrapPlayer { + profile { + firstName + lastName + email + username + userHandle + authed + isAuthed + plan + } + course(courseId: "%s") { + name + title + courseHasCaptions + translationLanguages { + code + name + } + supportsWideScreenVideoFormats + timestamp + modules { + name + title + duration + formattedDuration + author + authorized + clips { + authorized + clipId + duration + formattedDuration + id + index + moduleIndex + moduleTitle + name + title + watched + } + } + } + } + } +}''' + def _download_course(self, course_id, url, display_id): try: return self._download_course_rpc(course_id, url, display_id) @@ -39,20 +93,14 @@ class PluralsightBaseIE(InfoExtractor): def _download_course_rpc(self, course_id, url, display_id): response = self._download_json( - '%s/player/functions/rpc' % self._API_BASE, display_id, - 'Downloading course JSON', - data=json.dumps({ - 'fn': 'bootstrapPlayer', - 'payload': { - 'courseId': course_id, - }, - }).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }) + self._GRAPHQL_EP, display_id, data=json.dumps({ + 'query': self._GRAPHQL_COURSE_TMPL % course_id, + 'variables': {} + }).encode('utf-8'), headers=self._GRAPHQL_HEADERS) - course = try_get(response, lambda x: x['payload']['course'], dict) + course = try_get( + response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'], + dict) if course: return course @@ -90,6 +138,28 @@ class PluralsightIE(PluralsightBaseIE): 'only_matching': True, }] + GRAPHQL_VIEWCLIP_TMPL = ''' +query viewClip { + viewClip(input: { + author: "%(author)s", + clipIndex: %(clipIndex)d, + courseName: "%(courseName)s", + includeCaptions: %(includeCaptions)s, + locale: "%(locale)s", + mediaType: "%(mediaType)s", + moduleName: "%(moduleName)s", + quality: "%(quality)s" + }) { + urls { + url + cdn + rank + source + }, + status + } +}''' + def _real_initialize(self): self._login() @@ -277,7 +347,7 @@ class PluralsightIE(PluralsightBaseIE): f = QUALITIES[quality].copy() clip_post = { 'author': author, - 'includeCaptions': False, + 'includeCaptions': 'false', 'clipIndex': int(clip_idx), 'courseName': course_name, 'locale': 'en', @@ -286,11 +356,23 @@ class PluralsightIE(PluralsightBaseIE): 'quality': '%dx%d' % (f['width'], f['height']), } format_id = '%s-%s' % (ext, quality) - viewclip = self._download_json( - '%s/video/clips/viewclip' % self._API_BASE, display_id, - 'Downloading %s viewclip JSON' % format_id, fatal=False, - data=json.dumps(clip_post).encode('utf-8'), - headers={'Content-Type': 'application/json;charset=utf-8'}) + + try: + viewclip = self._download_json( + self._GRAPHQL_EP, display_id, + 'Downloading %s viewclip graphql' % format_id, + data=json.dumps({ + 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, + 'variables': {} + }).encode('utf-8'), + headers=self._GRAPHQL_HEADERS)['data']['viewClip'] + except ExtractorError: + # Still works but most likely will go soon + viewclip = self._download_json( + '%s/video/clips/viewclip' % self._API_BASE, display_id, + 'Downloading %s viewclip JSON' % format_id, fatal=False, + data=json.dumps(clip_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) # Pluralsight tracks multiple sequential calls to ViewClip API and start # to return 429 HTTP errors after some time (see From 24d26ab3808619b209e6a22c505a46eaa781c614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Jul 2018 04:49:03 +0700 Subject: [PATCH 303/488] [lynda] PEP 8 --- youtube_dl/extractor/lynda.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 1316cddb6..4ba61cd8a 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_str, compat_urlparse, ) From d5de0f21b97bb26c8c558e33b40202a6e02f7b05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Jul 2018 04:57:17 +0700 Subject: [PATCH 304/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index 8eb7469d4..b1045d7d4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +version <unreleased> + +Core +* [extractor/common] Properly escape % in MPD templates (#16867) +* [extractor/common] Use source URL as Referer for HTML5 entries (16849) +* Prefer ffmpeg over avconv by default (#8622) + +Extractors +* [pluralsight] Switch to graphql (#16889, #16895, #16896, #16899) +* [lynda] Simplify login and improve error capturing (#16891) ++ [go90] Add support for embed URLs (#16873) +* [go90] Detect geo restriction error and pass geo verification headers + (#16874) +* [vlive] Fix live streams extraction (#16871) +* [npo] Fix typo (#16872) ++ [mediaset] Add support for new videos and extract all formats (#16568) +* [dctptv] Restore extraction based on REST API (#16850) +* [svt] Improve extraction and add support for pages (#16802) +* [porncom] Fix extraction (#16808) + + version 2018.06.25 Extractors From 689af4960e16c63c1f26933095d2f8c8b76f119f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Jul 2018 04:59:21 +0700 Subject: [PATCH 305/488] release 2018.07.04 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 4 ++-- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 128e6e681..453983f84 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.25*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.25** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.04*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.04** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.25 +[debug] youtube-dl version 2018.07.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index b1045d7d4..c33bf7777 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.07.04 Core * [extractor/common] Properly escape % in MPD templates (#16867) diff --git a/README.md b/README.md index 499a0c206..09e62899a 100644 --- a/README.md +++ b/README.md @@ -427,9 +427,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo default; fix file if we can, warn otherwise) --prefer-avconv Prefer avconv over ffmpeg for running the - postprocessors (default) - --prefer-ffmpeg Prefer ffmpeg over avconv for running the postprocessors + --prefer-ffmpeg Prefer ffmpeg over avconv for running the + postprocessors (default) --ffmpeg-location PATH Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory. diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a78fabb02..19dc984dc 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -813,6 +813,7 @@ - **StretchInternet** - **SunPorno** - **SVT** + - **SVTPage** - **SVTPlay**: SVT Play and Öppet arkiv - **SVTSeries** - **SWRMediathek** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8fbafd6a1..4cf97291b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.25' +__version__ = '2018.07.04' From 9a6628aaf918afdcdf4c661f474e318207155780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Jul 2018 00:36:35 +0700 Subject: [PATCH 306/488] [youtube] Improve login error handling (closes #13822) --- youtube_dl/extractor/youtube.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 89c8b7f8d..117a57911 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -178,13 +178,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): warn('Unable to extract result entry') return False - tfa = try_get(res, lambda x: x[0][0], list) - if tfa: - tfa_str = try_get(tfa, lambda x: x[2], compat_str) - if tfa_str == 'TWO_STEP_VERIFICATION': + login_challenge = try_get(res, lambda x: x[0][0], list) + if login_challenge: + challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) + if challenge_str == 'TWO_STEP_VERIFICATION': # SEND_SUCCESS - TFA code has been successfully sent to phone # QUOTA_EXCEEDED - reached the limit of TFA codes - status = try_get(tfa, lambda x: x[5], compat_str) + status = try_get(login_challenge, lambda x: x[5], compat_str) if status == 'QUOTA_EXCEEDED': warn('Exceeded the limit of TFA codes, try later') return False @@ -228,6 +228,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor): check_cookie_url = try_get( tfa_results, lambda x: x[0][-1][2], compat_str) + else: + CHALLENGES = { + 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", + 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', + 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", + } + challenge = CHALLENGES.get( + challenge_str, + '%s returned error %s.' % (self.IE_NAME, challenge_str)) + warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) + return False else: check_cookie_url = try_get(res, lambda x: x[2], compat_str) From 94fef94d9c8571db255b0f3694617f5cd56825b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Jul 2018 02:14:06 +0700 Subject: [PATCH 307/488] [dplayit] Fix extraction (closes #16901) --- youtube_dl/extractor/dplay.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index fe47f6dce..a95e3213c 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -21,6 +21,7 @@ from ..utils import ( unified_strdate, unified_timestamp, update_url_query, + urljoin, USER_AGENTS, ) @@ -310,9 +311,11 @@ class DPlayItIE(InfoExtractor): if not info: info_url = self._search_regex( - r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', - webpage, 'info url') + (r'playback_json_url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + r'url\s*[:=]\s*["\'](?P<url>(?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)'), + webpage, 'info url', group='url') + info_url = urljoin(url, info_url) video_id = info_url.rpartition('/')[-1] try: @@ -322,6 +325,8 @@ class DPlayItIE(InfoExtractor): 'dplayit_token').value, 'Referer': url, }) + if isinstance(info, compat_str): + info = self._parse_json(info, display_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): info = self._parse_json(e.cause.read().decode('utf-8'), display_id) From e15141adaee67d1e9b9e97c17b73fe4c052c3449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Jul 2018 02:14:50 +0700 Subject: [PATCH 308/488] [dplayit] Sort formats --- youtube_dl/extractor/dplay.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index a95e3213c..ebf59512c 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -342,6 +342,7 @@ class DPlayItIE(InfoExtractor): formats = self._extract_m3u8_formats( hls_url, display_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) series = self._html_search_regex( r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>', From 1ed0b2f74d8337b9625716f9069233a341edd22e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Jul 2018 02:22:15 +0700 Subject: [PATCH 309/488] [watchbox] Fix extraction (closes #16904) --- youtube_dl/extractor/watchbox.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py index be0bcba15..d99313080 100644 --- a/youtube_dl/extractor/watchbox.py +++ b/youtube_dl/extractor/watchbox.py @@ -67,11 +67,12 @@ class WatchBoxIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - source = self._parse_json( + source = (self._parse_json( self._search_regex( - r'(?s)source["\']?\s*:\s*({.+?})\s*[,}]', webpage, 'source', + r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config', default='{}'), - video_id, transform_source=js_to_json, fatal=False) or {} + video_id, transform_source=js_to_json, + fatal=False) or {}).get('source') or {} video_id = compat_str(source.get('videoId') or video_id) From 4e71dfd819ccef91a056edc5bf6ca8cec9f2ad4f Mon Sep 17 00:00:00 2001 From: Aaron Brager <getaaron@gmail.com> Date: Thu, 5 Jul 2018 10:17:18 -0500 Subject: [PATCH 310/488] [README.md] Rename OS X to macOS --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 09e62899a..6d49d6a4f 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ youtube-dl - download videos from youtube.com or other video platforms # INSTALLATION -To install it right away for all UNIX users (Linux, OS X, etc.), type: +To install it right away for all UNIX users (Linux, macOS, etc.), type: sudo curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl sudo chmod a+rx /usr/local/bin/youtube-dl @@ -35,7 +35,7 @@ You can also use pip: This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information. -OS X users can install youtube-dl with [Homebrew](https://brew.sh/): +macOS users can install youtube-dl with [Homebrew](https://brew.sh/): brew install youtube-dl @@ -442,7 +442,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo # CONFIGURATION -You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself. +You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and macOS, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory: ``` From 47421507883850dc679dc23eb44a615f18282bed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Jul 2018 23:49:36 +0700 Subject: [PATCH 311/488] [funk] Fix extraction (closes #16918) --- youtube_dl/extractor/funk.py | 64 ++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/funk.py b/youtube_dl/extractor/funk.py index 0ff058619..76c20ffac 100644 --- a/youtube_dl/extractor/funk.py +++ b/youtube_dl/extractor/funk.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from .nexx import NexxIE +from ..compat import compat_str from ..utils import ( int_or_none, try_get, @@ -12,6 +13,19 @@ from ..utils import ( class FunkBaseIE(InfoExtractor): + _HEADERS = { + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8', + 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4', + } + _AUTH = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4' + + @staticmethod + def _make_headers(referer): + headers = FunkBaseIE._HEADERS.copy() + headers['Referer'] = referer + return headers + def _make_url_result(self, video): return { '_type': 'url_transparent', @@ -48,19 +62,19 @@ class FunkMixIE(FunkBaseIE): lists = self._download_json( 'https://www.funk.net/api/v3.1/curation/curatedLists/', - mix_id, headers={ - 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbC12Mi4wIiwic2NvcGUiOiJzdGF0aWMtY29udGVudC1hcGksY3VyYXRpb24tc2VydmljZSxzZWFyY2gtYXBpIn0.SGCC1IXHLtZYoo8PvRKlU2gXH1su8YSu47sB3S4iXBI', - 'Referer': url, - }, query={ + mix_id, headers=self._make_headers(url), query={ 'size': 100, - })['result']['lists'] + })['_embedded']['curatedListList'] metas = next( l for l in lists if mix_id in (l.get('entityId'), l.get('alias')))['videoMetas'] video = next( meta['videoDataDelegate'] - for meta in metas if meta.get('alias') == alias) + for meta in metas + if try_get( + meta, lambda x: x['videoDataDelegate']['alias'], + compat_str) == alias) return self._make_url_result(video) @@ -104,25 +118,39 @@ class FunkChannelIE(FunkBaseIE): channel_id = mobj.group('id') alias = mobj.group('alias') - headers = { - 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc', - 'Referer': url, - } + headers = self._make_headers(url) video = None - by_id_list = self._download_json( - 'https://www.funk.net/api/v3.0/content/videos/byIdList', channel_id, - headers=headers, query={ - 'ids': alias, + # Id-based channels are currently broken on their side: webplayer + # tries to process them via byChannelAlias endpoint and fails + # predictably. + by_channel_alias = self._download_json( + 'https://www.funk.net/api/v3.1/webapp/videos/byChannelAlias/%s' + % channel_id, + 'Downloading byChannelAlias JSON', headers=headers, query={ + 'size': 100, }, fatal=False) - if by_id_list: - video = try_get(by_id_list, lambda x: x['result'][0], dict) + if by_channel_alias: + video_list = try_get( + by_channel_alias, lambda x: x['_embedded']['videoList'], list) + if video_list: + video = next(r for r in video_list if r.get('alias') == alias) + + if not video: + by_id_list = self._download_json( + 'https://www.funk.net/api/v3.0/content/videos/byIdList', + channel_id, 'Downloading byIdList JSON', headers=headers, + query={ + 'ids': alias, + }, fatal=False) + if by_id_list: + video = try_get(by_id_list, lambda x: x['result'][0], dict) if not video: results = self._download_json( - 'https://www.funk.net/api/v3.0/content/videos/filter', channel_id, - headers=headers, query={ + 'https://www.funk.net/api/v3.0/content/videos/filter', + channel_id, 'Downloading filter JSON', headers=headers, query={ 'channelId': channel_id, 'size': 100, })['result'] From 6868d272e5c4a85ba4143bacbb7269dac099c55d Mon Sep 17 00:00:00 2001 From: Luca Cherubin <luca.cherubin@gmail.com> Date: Thu, 26 Apr 2018 20:33:09 +0100 Subject: [PATCH 312/488] [frontendmasters] Add extractor --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/frontendmaster.py | 271 +++++++++++++++++++++++++ 2 files changed, 275 insertions(+) create mode 100644 youtube_dl/extractor/frontendmaster.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f2377521b..265b4aa9d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -390,6 +390,10 @@ from .francetv import ( from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE +from .frontendmaster import ( + FrontEndMasterIE, + FrontEndMasterCourseIE +) from .funimation import FunimationIE from .funk import ( FunkMixIE, diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py new file mode 100644 index 000000000..21e382da9 --- /dev/null +++ b/youtube_dl/extractor/frontendmaster.py @@ -0,0 +1,271 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import collections +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse) +from ..utils import ( + ExtractorError, + urlencode_postdata, + qualities, unescapeHTML) + + +class FrontEndMasterBaseIE(InfoExtractor): + _API_BASE = 'https://api.frontendmasters.com/v1/kabuki/courses' + _VIDEO_BASE = 'http://www.frontendmasters.com/courses' + _CAPTIONS_BASE = 'https://api.frontendmasters.com/v1/kabuki/transcripts' + _COOKIES_BASE = 'https://api.frontendmasters.com' + _LOGIN_URL = 'https://frontendmasters.com/login/' + + _QUALITIES_PREFERENCE = ('low', 'medium', 'high') + _QUALITIES = { + 'low': {'width': 480, 'height': 360}, + 'medium': {'width': 1280, 'height': 720}, + 'high': {'width': 1920, 'height': 1080} + } + + AllowedQuality = collections.namedtuple('AllowedQuality', + ['ext', 'qualities']) + _ALLOWED_QUALITIES = [ + AllowedQuality('webm', ['low', 'medium', 'high']), + AllowedQuality('mp4', ['low', 'medium', 'high']) + ] + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post_url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'} + ) + + error = self._search_regex( + r'<div[^>]+class=["\']Message MessageAlert["\'][^>]*>' + r'([^<]+)' + r'</div>', + response, 'error message', default=None) + + if error: + raise ExtractorError('Unable to login: %s' % unescapeHTML(error), + expected=True) + + def _download_course(self, course_id, url): + response = self._download_json( + '%s/%s' % (self._API_BASE, course_id), course_id, + 'Downloading course JSON', + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }) + return response + + @staticmethod + def _pair_section_video_element(lesson_elements): + sections = {} + current_section = None + current_section_number = 0 + for elem in lesson_elements: + if not isinstance(elem, int): + elem_name = elem + if not isinstance(elem_name, str): + # convert unicode to str + elem_name = elem.encode('utf-8') + (current_section, current_section_number) = \ + (elem_name, current_section_number + 1) + else: + if current_section: + sections[elem] = (current_section, current_section_number) + + return sections + + +class FrontEndMasterIE(FrontEndMasterBaseIE): + IE_NAME = 'frontend-masters' + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/' \ + r'(?P<courseid>[a-z\-]+)/' \ + r'(?P<id>[a-z\-]+)' + + _NETRC_MACHINE = 'frontendmasters' + + _TEST = { + 'url': 'https://frontendmasters.com/courses/web-development/tools', + 'md5': '7f161159710d6b7016a4f4af6fcb05e2', + 'info_dict': { + 'id': 'tools', + 'title': 'Tools', + 'display_id': 'tools', + 'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7', + 'ext': 'mp4' + }, + 'skip': 'Requires FrontendMasters account credentials', + } + + def _get_subtitles(self, video_hash, video_id): + captions = self._download_webpage( + '%s/%s.vtt' % (self._CAPTIONS_BASE, video_hash), video_id, + fatal=False) + if captions: + return { + 'en': [{ + 'ext': 'vtt', + 'data': captions + }] + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + course_id = mobj.group('courseid') + + course_json_content = self._download_course(course_id=course_id, + url=url) + + # Necessary to get mandatory informations like title and video_url + lesson_index = course_json_content.get('lessonSlugs').index(video_id) + lesson_hash = course_json_content.get('lessonHashes')[lesson_index] + lesson_data = course_json_content.get('lessonData')[lesson_hash] + # This is necessary to get the link for the video + lesson_source_base = lesson_data['sourceBase'] + + lesson_title = lesson_data['title'] + + # Some optional fields + lesson_description = lesson_data.get('description') + lesson_index = lesson_data.get('index') + lesson_slug = lesson_data.get('slug') + lesson_thumbnail_url = lesson_data.get('thumbnail') + lesson_section_elements = course_json_content.get('lessonElements') + + try: + course_sections_pairing = self._pair_section_video_element( + lesson_section_elements) + + lesson_section = \ + course_sections_pairing.get(lesson_index)[0] + + lesson_section_number = \ + course_sections_pairing.get(lesson_index)[1] + except Exception: + lesson_section = None + lesson_section_number = None + + video_request_url = '%s/source' + video_request_headers = { + 'origin': 'https://frontendmasters.com', + 'referer': lesson_source_base, + } + + quality_key = qualities(self._QUALITIES_PREFERENCE) + + formats = [] + for ext, qualities_ in self._ALLOWED_QUALITIES: + for quality in qualities_: + f = self._QUALITIES[quality].copy() + video_request_params = { + 'r': f['height'], + 'f': ext + } + video_response = self._download_json( + video_request_url % lesson_source_base, video_id, + query=video_request_params, headers=video_request_headers) + + video_url = video_response.get('url') + clip_f = f.copy() + clip_f.update({ + 'url': video_url, + 'ext': ext, + 'format_id': '%s-%s' % (ext, quality), + 'quality': quality_key(quality), + 'height': f['height'] + }) + formats.append(clip_f) + + self._sort_formats(formats) + + subtitles = self.extract_subtitles(lesson_hash, video_id) + + return { + 'id': video_id, + 'display_id': lesson_slug, + 'title': lesson_title, + 'description': lesson_description, + 'chapter': lesson_section, + 'chapter_number': lesson_section_number, + 'thumbnail': lesson_thumbnail_url, + 'formats': formats, + 'subtitles': subtitles + } + + +class FrontEndMasterCourseIE(FrontEndMasterBaseIE): + IE_NAME = 'frontend-masters:course' + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<courseid>[a-z\-]+)/?$' + + _NETRC_MACHINE = 'frontendmasters' + + _TEST = { + 'url': 'https://frontendmasters.com/courses/javascript-basics/', + 'info_dict': { + 'id': 'javascript-basics', + 'title': 'Introduction to JavaScript Programming', + 'description': 'md5:269412fbb76d86954761599ad8e4cbc9' + }, + 'playlist_count': 19, + 'skip': 'Requires FrontendMasters account credentials' + } + + @classmethod + def suitable(cls, url): + return False if FrontEndMasterIE.suitable(url) else super(FrontEndMasterBaseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('courseid') + course_json_content = self._download_course(course_id=course_id, + url=url) + + title = course_json_content.get('title') + description = course_json_content.get('description') + course_display_id = course_json_content.get('slug') + + videos_data = course_json_content.get('lessonData').values() + videos_data = sorted(videos_data, key=lambda video: video.get('index')) + + entries = [] + for video in videos_data: + video_slug = video.get('slug') + clip_url = '%s/%s/%s' % ( + self._VIDEO_BASE, course_display_id, video_slug) + entries.append({ + '_type': 'url_transparent', + 'url': clip_url, + 'ie_key': FrontEndMasterIE.ie_key() + }) + + return self.playlist_result(entries, course_id, title, description) From 69fcdb845b9744125161f514cb4166becbae2959 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Jul 2018 00:48:23 +0700 Subject: [PATCH 313/488] [frontendmasters] Fix issues and improve extraction (closes #3661, closes #16328) --- youtube_dl/extractor/extractors.py | 7 +- youtube_dl/extractor/frontendmaster.py | 271 ------------------------ youtube_dl/extractor/frontendmasters.py | 262 +++++++++++++++++++++++ 3 files changed, 266 insertions(+), 274 deletions(-) delete mode 100644 youtube_dl/extractor/frontendmaster.py create mode 100644 youtube_dl/extractor/frontendmasters.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 265b4aa9d..ed532d77f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -390,9 +390,10 @@ from .francetv import ( from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE -from .frontendmaster import ( - FrontEndMasterIE, - FrontEndMasterCourseIE +from .frontendmasters import ( + FrontendMastersIE, + FrontendMastersLessonIE, + FrontendMastersCourseIE ) from .funimation import FunimationIE from .funk import ( diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py deleted file mode 100644 index 21e382da9..000000000 --- a/youtube_dl/extractor/frontendmaster.py +++ /dev/null @@ -1,271 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import collections -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urlparse) -from ..utils import ( - ExtractorError, - urlencode_postdata, - qualities, unescapeHTML) - - -class FrontEndMasterBaseIE(InfoExtractor): - _API_BASE = 'https://api.frontendmasters.com/v1/kabuki/courses' - _VIDEO_BASE = 'http://www.frontendmasters.com/courses' - _CAPTIONS_BASE = 'https://api.frontendmasters.com/v1/kabuki/transcripts' - _COOKIES_BASE = 'https://api.frontendmasters.com' - _LOGIN_URL = 'https://frontendmasters.com/login/' - - _QUALITIES_PREFERENCE = ('low', 'medium', 'high') - _QUALITIES = { - 'low': {'width': 480, 'height': 360}, - 'medium': {'width': 1280, 'height': 720}, - 'high': {'width': 1920, 'height': 1080} - } - - AllowedQuality = collections.namedtuple('AllowedQuality', - ['ext', 'qualities']) - _ALLOWED_QUALITIES = [ - AllowedQuality('webm', ['low', 'medium', 'high']), - AllowedQuality('mp4', ['low', 'medium', 'high']) - ] - - def _real_initialize(self): - self._login() - - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'username': username, - 'password': password - }) - - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, - 'post_url', default=self._LOGIN_URL, group='url') - - if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - response = self._download_webpage( - post_url, None, 'Logging in', - data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'} - ) - - error = self._search_regex( - r'<div[^>]+class=["\']Message MessageAlert["\'][^>]*>' - r'([^<]+)' - r'</div>', - response, 'error message', default=None) - - if error: - raise ExtractorError('Unable to login: %s' % unescapeHTML(error), - expected=True) - - def _download_course(self, course_id, url): - response = self._download_json( - '%s/%s' % (self._API_BASE, course_id), course_id, - 'Downloading course JSON', - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }) - return response - - @staticmethod - def _pair_section_video_element(lesson_elements): - sections = {} - current_section = None - current_section_number = 0 - for elem in lesson_elements: - if not isinstance(elem, int): - elem_name = elem - if not isinstance(elem_name, str): - # convert unicode to str - elem_name = elem.encode('utf-8') - (current_section, current_section_number) = \ - (elem_name, current_section_number + 1) - else: - if current_section: - sections[elem] = (current_section, current_section_number) - - return sections - - -class FrontEndMasterIE(FrontEndMasterBaseIE): - IE_NAME = 'frontend-masters' - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/' \ - r'(?P<courseid>[a-z\-]+)/' \ - r'(?P<id>[a-z\-]+)' - - _NETRC_MACHINE = 'frontendmasters' - - _TEST = { - 'url': 'https://frontendmasters.com/courses/web-development/tools', - 'md5': '7f161159710d6b7016a4f4af6fcb05e2', - 'info_dict': { - 'id': 'tools', - 'title': 'Tools', - 'display_id': 'tools', - 'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7', - 'ext': 'mp4' - }, - 'skip': 'Requires FrontendMasters account credentials', - } - - def _get_subtitles(self, video_hash, video_id): - captions = self._download_webpage( - '%s/%s.vtt' % (self._CAPTIONS_BASE, video_hash), video_id, - fatal=False) - if captions: - return { - 'en': [{ - 'ext': 'vtt', - 'data': captions - }] - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - course_id = mobj.group('courseid') - - course_json_content = self._download_course(course_id=course_id, - url=url) - - # Necessary to get mandatory informations like title and video_url - lesson_index = course_json_content.get('lessonSlugs').index(video_id) - lesson_hash = course_json_content.get('lessonHashes')[lesson_index] - lesson_data = course_json_content.get('lessonData')[lesson_hash] - # This is necessary to get the link for the video - lesson_source_base = lesson_data['sourceBase'] - - lesson_title = lesson_data['title'] - - # Some optional fields - lesson_description = lesson_data.get('description') - lesson_index = lesson_data.get('index') - lesson_slug = lesson_data.get('slug') - lesson_thumbnail_url = lesson_data.get('thumbnail') - lesson_section_elements = course_json_content.get('lessonElements') - - try: - course_sections_pairing = self._pair_section_video_element( - lesson_section_elements) - - lesson_section = \ - course_sections_pairing.get(lesson_index)[0] - - lesson_section_number = \ - course_sections_pairing.get(lesson_index)[1] - except Exception: - lesson_section = None - lesson_section_number = None - - video_request_url = '%s/source' - video_request_headers = { - 'origin': 'https://frontendmasters.com', - 'referer': lesson_source_base, - } - - quality_key = qualities(self._QUALITIES_PREFERENCE) - - formats = [] - for ext, qualities_ in self._ALLOWED_QUALITIES: - for quality in qualities_: - f = self._QUALITIES[quality].copy() - video_request_params = { - 'r': f['height'], - 'f': ext - } - video_response = self._download_json( - video_request_url % lesson_source_base, video_id, - query=video_request_params, headers=video_request_headers) - - video_url = video_response.get('url') - clip_f = f.copy() - clip_f.update({ - 'url': video_url, - 'ext': ext, - 'format_id': '%s-%s' % (ext, quality), - 'quality': quality_key(quality), - 'height': f['height'] - }) - formats.append(clip_f) - - self._sort_formats(formats) - - subtitles = self.extract_subtitles(lesson_hash, video_id) - - return { - 'id': video_id, - 'display_id': lesson_slug, - 'title': lesson_title, - 'description': lesson_description, - 'chapter': lesson_section, - 'chapter_number': lesson_section_number, - 'thumbnail': lesson_thumbnail_url, - 'formats': formats, - 'subtitles': subtitles - } - - -class FrontEndMasterCourseIE(FrontEndMasterBaseIE): - IE_NAME = 'frontend-masters:course' - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<courseid>[a-z\-]+)/?$' - - _NETRC_MACHINE = 'frontendmasters' - - _TEST = { - 'url': 'https://frontendmasters.com/courses/javascript-basics/', - 'info_dict': { - 'id': 'javascript-basics', - 'title': 'Introduction to JavaScript Programming', - 'description': 'md5:269412fbb76d86954761599ad8e4cbc9' - }, - 'playlist_count': 19, - 'skip': 'Requires FrontendMasters account credentials' - } - - @classmethod - def suitable(cls, url): - return False if FrontEndMasterIE.suitable(url) else super(FrontEndMasterBaseIE, cls).suitable(url) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - course_id = mobj.group('courseid') - course_json_content = self._download_course(course_id=course_id, - url=url) - - title = course_json_content.get('title') - description = course_json_content.get('description') - course_display_id = course_json_content.get('slug') - - videos_data = course_json_content.get('lessonData').values() - videos_data = sorted(videos_data, key=lambda video: video.get('index')) - - entries = [] - for video in videos_data: - video_slug = video.get('slug') - clip_url = '%s/%s/%s' % ( - self._VIDEO_BASE, course_display_id, video_slug) - entries.append({ - '_type': 'url_transparent', - 'url': clip_url, - 'ie_key': FrontEndMasterIE.ie_key() - }) - - return self.playlist_result(entries, course_id, title, description) diff --git a/youtube_dl/extractor/frontendmasters.py b/youtube_dl/extractor/frontendmasters.py new file mode 100644 index 000000000..770db46d0 --- /dev/null +++ b/youtube_dl/extractor/frontendmasters.py @@ -0,0 +1,262 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + parse_duration, + urlencode_postdata, +) + + +class FrontendMastersBaseIE(InfoExtractor): + _API_BASE = 'https://api.frontendmasters.com/v1/kabuki' + _LOGIN_URL = 'https://frontendmasters.com/login/' + + _NETRC_MACHINE = 'frontendmasters' + + _QUALITIES = { + 'low': {'width': 480, 'height': 360}, + 'mid': {'width': 1280, 'height': 720}, + 'high': {'width': 1920, 'height': 1080} + } + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post_url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + # Successful login + if any(p in response for p in ( + 'wp-login.php?action=logout', '>Logout')): + return + + error = self._html_search_regex( + r'class=(["\'])(?:(?!\1).)*\bMessageAlert\b(?:(?!\1).)*\1[^>]*>(?P<error>[^<]+)<', + response, 'error message', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class FrontendMastersPageBaseIE(FrontendMastersBaseIE): + def _download_course(self, course_name, url): + return self._download_json( + '%s/courses/%s' % (self._API_BASE, course_name), course_name, + 'Downloading course JSON', headers={'Referer': url}) + + @staticmethod + def _extract_chapters(course): + chapters = [] + lesson_elements = course.get('lessonElements') + if isinstance(lesson_elements, list): + chapters = [e for e in lesson_elements if isinstance(e, compat_str)] + return chapters + + @staticmethod + def _extract_lesson(chapters, lesson_id, lesson): + title = lesson.get('title') or lesson_id + display_id = lesson.get('slug') + description = lesson.get('description') + thumbnail = lesson.get('thumbnail') + + chapter_number = None + index = lesson.get('index') + element_index = lesson.get('elementIndex') + if (isinstance(index, int) and isinstance(element_index, int) and + index < element_index): + chapter_number = element_index - index + chapter = (chapters[chapter_number - 1] + if chapter_number - 1 < len(chapters) else None) + + duration = None + timestamp = lesson.get('timestamp') + if isinstance(timestamp, compat_str): + mobj = re.search( + r'(?P<start>\d{1,2}:\d{1,2}:\d{1,2})\s*-(?P<end>\s*\d{1,2}:\d{1,2}:\d{1,2})', + timestamp) + if mobj: + duration = parse_duration(mobj.group('end')) - parse_duration( + mobj.group('start')) + + return { + '_type': 'url_transparent', + 'url': 'frontendmasters:%s' % lesson_id, + 'ie_key': FrontendMastersIE.ie_key(), + 'id': lesson_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'chapter': chapter, + 'chapter_number': chapter_number, + } + + +class FrontendMastersIE(FrontendMastersBaseIE): + _VALID_URL = r'(?:frontendmasters:|https?://api\.frontendmasters\.com/v\d+/kabuki/video/)(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://api.frontendmasters.com/v1/kabuki/video/a2qogef6ba', + 'md5': '7f161159710d6b7016a4f4af6fcb05e2', + 'info_dict': { + 'id': 'a2qogef6ba', + 'ext': 'mp4', + 'title': 'a2qogef6ba', + }, + 'skip': 'Requires FrontendMasters account credentials', + }, { + 'url': 'frontendmasters:a2qogef6ba', + 'only_matching': True, + }] + + def _real_extract(self, url): + lesson_id = self._match_id(url) + + source_url = '%s/video/%s/source' % (self._API_BASE, lesson_id) + + formats = [] + for ext in ('webm', 'mp4'): + for quality in ('low', 'mid', 'high'): + resolution = self._QUALITIES[quality].copy() + format_id = '%s-%s' % (ext, quality) + format_url = self._download_json( + source_url, lesson_id, + 'Downloading %s source JSON' % format_id, query={ + 'f': ext, + 'r': resolution['height'], + }, headers={ + 'Referer': url, + }, fatal=False)['url'] + + if not format_url: + continue + + f = resolution.copy() + f.update({ + 'url': format_url, + 'ext': ext, + 'format_id': format_id, + }) + formats.append(f) + self._sort_formats(formats) + + subtitles = { + 'en': [{ + 'url': '%s/transcripts/%s.vtt' % (self._API_BASE, lesson_id), + }] + } + + return { + 'id': lesson_id, + 'title': lesson_id, + 'formats': formats, + 'subtitles': subtitles + } + + +class FrontendMastersLessonIE(FrontendMastersPageBaseIE): + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<course_name>[^/]+)/(?P<lesson_name>[^/]+)' + _TEST = { + 'url': 'https://frontendmasters.com/courses/web-development/tools', + 'info_dict': { + 'id': 'a2qogef6ba', + 'display_id': 'tools', + 'ext': 'mp4', + 'title': 'Tools', + 'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7', + 'thumbnail': r're:^https?://.*\.jpg$', + 'chapter': 'Introduction', + 'chapter_number': 1, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires FrontendMasters account credentials', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_name, lesson_name = mobj.group('course_name', 'lesson_name') + + course = self._download_course(course_name, url) + + lesson_id, lesson = next( + (video_id, data) + for video_id, data in course['lessonData'].items() + if data.get('slug') == lesson_name) + + chapters = self._extract_chapters(course) + return self._extract_lesson(chapters, lesson_id, lesson) + + +class FrontendMastersCourseIE(FrontendMastersPageBaseIE): + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<id>[^/]+)' + _TEST = { + 'url': 'https://frontendmasters.com/courses/web-development/', + 'info_dict': { + 'id': 'web-development', + 'title': 'Introduction to Web Development', + 'description': 'md5:9317e6e842098bf725d62360e52d49a6', + }, + 'playlist_count': 81, + 'skip': 'Requires FrontendMasters account credentials', + } + + @classmethod + def suitable(cls, url): + return False if FrontendMastersLessonIE.suitable(url) else super( + FrontendMastersBaseIE, cls).suitable(url) + + def _real_extract(self, url): + course_name = self._match_id(url) + + course = self._download_course(course_name, url) + + chapters = self._extract_chapters(course) + + lessons = sorted( + course['lessonData'].values(), key=lambda data: data['index']) + + entries = [] + for lesson in lessons: + lesson_name = lesson.get('slug') + if not lesson_name: + continue + lesson_id = lesson.get('hash') or lesson.get('statsId') + entries.append(self._extract_lesson(chapters, lesson_id, lesson)) + + title = course.get('title') + description = course.get('description') + + return self.playlist_result(entries, course_name, title, description) From e06632e3fe25036b804a62469bb18fa4c37e3368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Jul 2018 08:22:56 +0700 Subject: [PATCH 314/488] [downloader/dash] Improve error handling (#16927) --- youtube_dl/downloader/dash.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 576ece6db..eaa7adf7c 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .fragment import FragmentFD from ..compat import compat_urllib_error -from ..utils import urljoin +from ..utils import ( + DownloadError, + urljoin, +) class DashSegmentsFD(FragmentFD): @@ -57,6 +60,14 @@ class DashSegmentsFD(FragmentFD): count += 1 if count <= fragment_retries: self.report_retry_fragment(err, frag_index, count, fragment_retries) + except DownloadError: + # Don't retry fragment if error occurred during HTTP downloading + # itself since it has own retry settings + if not fatal: + self.report_skip_fragment(frag_index) + break + raise + if count > fragment_retries: if not fatal: self.report_skip_fragment(frag_index) From 0685d9727b9657fc8a31c96cb52c4155de29fcfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Jul 2018 23:43:05 +0700 Subject: [PATCH 315/488] [utils] Share JSON-LD regex --- youtube_dl/extractor/common.py | 4 ++-- youtube_dl/utils.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 78f053f18..5d4db54d5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -52,6 +52,7 @@ from ..utils import ( GeoUtils, int_or_none, js_to_json, + JSON_LD_RE, mimetype2ext, orderedSet, parse_codecs, @@ -1149,8 +1150,7 @@ class InfoExtractor(object): def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( - r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', - html, 'JSON-LD', group='json_ld', **kwargs) + JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs) default = kwargs.get('default', NO_DEFAULT) if not json_ld: return default if default is not NO_DEFAULT else {} diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 6a3199fb9..8c45166d7 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -184,6 +184,7 @@ DATE_FORMATS_MONTH_FIRST.extend([ ]) PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" +JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>' def preferredencoding(): From 79fd7320e24596b39d81c2a364fb5b41c2f57b41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Jul 2018 23:44:05 +0700 Subject: [PATCH 316/488] [nrktv] Add support for new episode URL schema (closes #16909) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nrk.py | 38 +++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ed532d77f..a20712d34 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -768,6 +768,7 @@ from .nrk import ( NRKSkoleIE, NRKTVIE, NRKTVDirekteIE, + NRKTVEpisodeIE, NRKTVEpisodesIE, NRKTVSeriesIE, ) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 7157e2390..50dd07d11 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -8,6 +8,7 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, int_or_none, + JSON_LD_RE, parse_age_limit, parse_duration, ) @@ -359,6 +360,40 @@ class NRKTVIE(NRKBaseIE): }] +class NRKTVEpisodeIE(InfoExtractor): + _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)' + _TEST = { + 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', + 'info_dict': { + 'id': 'MSUI14000816AA', + 'ext': 'mp4', + 'title': 'Backstage 8:30', + 'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4', + 'duration': 1320, + 'series': 'Backstage', + 'season_number': 1, + 'episode_number': 8, + 'episode': '8:30', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + nrk_id = self._parse_json( + self._search_regex(JSON_LD_RE, webpage, 'JSON-LD', group='json_ld'), + display_id)['@id'] + + assert re.match(NRKTVIE._EPISODE_RE, nrk_id) + return self.url_result( + 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id) + + class NRKTVDirekteIE(NRKTVIE): IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)' @@ -470,7 +505,8 @@ class NRKTVSeriesIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if NRKTVIE.suitable(url) else super(NRKTVSeriesIE, cls).suitable(url) + return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) + else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): series_id = self._match_id(url) From 4b3ee09886d1f2a096004013e6a8a13a1f564ba8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 10 Jul 2018 00:21:14 +0700 Subject: [PATCH 317/488] [nrktv] Add support for new season and serie URL schema --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nrk.py | 208 ++++++++++++++++++++--------- 2 files changed, 149 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a20712d34..c6f8a785a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -770,6 +770,7 @@ from .nrk import ( NRKTVDirekteIE, NRKTVEpisodeIE, NRKTVEpisodesIE, + NRKTVSeasonIE, NRKTVSeriesIE, ) from .ntvde import NTVDeIE diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 50dd07d11..a231735fb 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,13 +4,18 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( ExtractorError, int_or_none, JSON_LD_RE, + NO_DEFAULT, parse_age_limit, parse_duration, + try_get, ) @@ -394,6 +399,148 @@ class NRKTVEpisodeIE(InfoExtractor): 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id) +class NRKTVSerieBaseIE(InfoExtractor): + def _extract_series(self, webpage, display_id, fatal=True): + config = self._parse_json( + self._search_regex( + r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', webpage, 'config', + default='{}' if not fatal else NO_DEFAULT), + display_id, fatal=False) + if not config: + return + return try_get(config, lambda x: x['series'], dict) + + def _extract_episodes(self, season): + entries = [] + if not isinstance(season, dict): + return entries + episodes = season.get('episodes') + if not isinstance(episodes, list): + return entries + for episode in episodes: + nrk_id = episode.get('prfId') + if not nrk_id or not isinstance(nrk_id, compat_str): + continue + entries.append(self.url_result( + 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) + return entries + + +class NRKTVSeasonIE(NRKTVSerieBaseIE): + _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P<id>\d+)' + _TEST = { + 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', + 'info_dict': { + 'id': '1', + 'title': 'Sesong 1', + }, + 'playlist_mincount': 30, + } + + @classmethod + def suitable(cls, url): + return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) + else super(NRKTVSeasonIE, cls).suitable(url)) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + series = self._extract_series(webpage, display_id) + + season = next( + s for s in series['seasons'] + if int(display_id) == s.get('seasonNumber')) + + title = try_get(season, lambda x: x['titles']['title'], compat_str) + return self.playlist_result( + self._extract_episodes(season), display_id, title) + + +class NRKTVSeriesIE(NRKTVSerieBaseIE): + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' + _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' + _TESTS = [{ + # new layout + 'url': 'https://tv.nrk.no/serie/backstage', + 'info_dict': { + 'id': 'backstage', + 'title': 'Backstage', + 'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3', + }, + 'playlist_mincount': 60, + }, { + # old layout + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'info_dict': { + 'id': 'groenn-glede', + 'title': 'Grønn glede', + 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', + }, + 'playlist_mincount': 9, + }, { + 'url': 'http://tv.nrksuper.no/serie/labyrint', + 'info_dict': { + 'id': 'labyrint', + 'title': 'Labyrint', + 'description': 'md5:58afd450974c89e27d5a19212eee7115', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/saving-the-human-race', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/postmann-pat', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return ( + False if any(ie.suitable(url) + for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE)) + else super(NRKTVSeriesIE, cls).suitable(url)) + + def _real_extract(self, url): + series_id = self._match_id(url) + + webpage = self._download_webpage(url, series_id) + + # New layout (e.g. https://tv.nrk.no/serie/backstage) + series = self._extract_series(webpage, series_id, fatal=False) + if series: + title = try_get(series, lambda x: x['titles']['title'], compat_str) + description = try_get( + series, lambda x: x['titles']['subtitle'], compat_str) + entries = [] + for season in series['seasons']: + entries.extend(self._extract_episodes(season)) + return self.playlist_result(entries, series_id, title, description) + + # Old layout (e.g. https://tv.nrk.no/serie/groenn-glede) + entries = [ + self.url_result( + 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( + series=series_id, season=season_id)) + for season_id in re.findall(self._ITEM_RE, webpage) + ] + + title = self._html_search_meta( + 'seriestitle', webpage, + 'title', default=None) or self._og_search_title( + webpage, fatal=False) + + description = self._html_search_meta( + 'series_description', webpage, + 'description', default=None) or self._og_search_description(webpage) + + return self.playlist_result(entries, series_id, title, description) + + class NRKTVDirekteIE(NRKTVIE): IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)' @@ -473,65 +620,6 @@ class NRKTVEpisodesIE(NRKPlaylistBaseIE): r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False) -class NRKTVSeriesIE(InfoExtractor): - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' - _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://tv.nrk.no/serie/groenn-glede', - 'info_dict': { - 'id': 'groenn-glede', - 'title': 'Grønn glede', - 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', - }, - 'playlist_mincount': 9, - }, { - 'url': 'http://tv.nrksuper.no/serie/labyrint', - 'info_dict': { - 'id': 'labyrint', - 'title': 'Labyrint', - 'description': 'md5:58afd450974c89e27d5a19212eee7115', - }, - 'playlist_mincount': 3, - }, { - 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/saving-the-human-race', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/postmann-pat', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) - else super(NRKTVSeriesIE, cls).suitable(url)) - - def _real_extract(self, url): - series_id = self._match_id(url) - - webpage = self._download_webpage(url, series_id) - - entries = [ - self.url_result( - 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( - series=series_id, season=season_id)) - for season_id in re.findall(self._ITEM_RE, webpage) - ] - - title = self._html_search_meta( - 'seriestitle', webpage, - 'title', default=None) or self._og_search_title( - webpage, fatal=False) - - description = self._html_search_meta( - 'series_description', webpage, - 'description', default=None) or self._og_search_description(webpage) - - return self.playlist_result(entries, series_id, title, description) - - class NRKSkoleIE(InfoExtractor): IE_DESC = 'NRK Skole' _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)' From 7e8e948cf7eb17de57e95f20d1b3cb963f46f121 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 10 Jul 2018 02:08:15 +0700 Subject: [PATCH 318/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index c33bf7777..978a316bc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +version <unreleased> + +Core +* [utils] Share JSON-LD regular expression +* [downloader/dash] Improve error handling (#16927) + +Extractors ++ [nrktv] Add support for new season and serie URL schema ++ [nrktv] Add support for new episode URL schema (#16909) ++ [frontendmasters] Add support for frontendmasters.com (#3661, #16328) +* [funk] Fix extraction (#16918) +* [watchbox] Fix extraction (#16904) +* [dplayit] Sort formats +* [dplayit] Fix extraction (#16901) +* [youtube] Improve login error handling (#13822) + + version 2018.07.04 Core From 40a051fa9f48000f311f243c40e3cae588420738 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 10 Jul 2018 02:09:51 +0700 Subject: [PATCH 319/488] release 2018.07.10 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 5 +++++ youtube_dl/version.py | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 453983f84..f192c6633 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.04*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.04** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.10** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.07.04 +[debug] youtube-dl version 2018.07.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 978a316bc..1d602079e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.07.10 Core * [utils] Share JSON-LD regular expression diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 19dc984dc..6cbe81802 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -302,6 +302,9 @@ - **Freesound** - **freespeech.org** - **FreshLive** + - **FrontendMasters** + - **FrontendMastersCourse** + - **FrontendMastersLesson** - **Funimation** - **FunkChannel** - **FunkMix** @@ -589,7 +592,9 @@ - **NRKSkole**: NRK Skole - **NRKTV**: NRK TV and NRK Radio - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte + - **NRKTVEpisode** - **NRKTVEpisodes** + - **NRKTVSeason** - **NRKTVSeries** - **ntv.ru** - **Nuvid** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4cf97291b..c7083cf47 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.07.04' +__version__ = '2018.07.10' From 79367a98208fbf01d6e04b6747a6e01d0b1f8b9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 Jul 2018 18:05:06 +0700 Subject: [PATCH 320/488] [pornhub] Improve extraction and extract all formats (closes #12166, closes #15891, closes #16262, closes #16959) --- youtube_dl/extractor/pornhub.py | 125 ++++++++++++++++++++------------ 1 file changed, 79 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 23e24d216..97f988da4 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -4,28 +4,21 @@ from __future__ import unicode_literals import functools import itertools import operator -# import os import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - # compat_urllib_parse_unquote, - # compat_urllib_parse_unquote_plus, - # compat_urllib_parse_urlparse, + compat_str, ) from ..utils import ( ExtractorError, int_or_none, js_to_json, orderedSet, - # sanitized_Request, remove_quotes, str_to_int, ) -# from ..aes import ( -# aes_decrypt_text -# ) class PornHubIE(InfoExtractor): @@ -62,7 +55,7 @@ class PornHubIE(InfoExtractor): 'id': '1331683002', 'ext': 'mp4', 'title': '重庆婷婷女王足交', - 'uploader': 'cj397186295', + 'uploader': 'Unknown', 'duration': 1753, 'view_count': int, 'like_count': int, @@ -121,7 +114,7 @@ class PornHubIE(InfoExtractor): self._set_cookie('pornhub.com', 'platform', platform) return self._download_webpage( 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id, - video_id) + video_id, 'Downloading %s webpage' % platform) webpage = dl_webpage('pc') @@ -134,48 +127,19 @@ class PornHubIE(InfoExtractor): 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) - tv_webpage = dl_webpage('tv') - - assignments = self._search_regex( - r'(var.+?mediastring.+?)</script>', tv_webpage, - 'encoded url').split(';') - - js_vars = {} - - def parse_js_value(inp): - inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) - if '+' in inp: - inps = inp.split('+') - return functools.reduce( - operator.concat, map(parse_js_value, inps)) - inp = inp.strip() - if inp in js_vars: - return js_vars[inp] - return remove_quotes(inp) - - for assn in assignments: - assn = assn.strip() - if not assn: - continue - assn = re.sub(r'var\s+', '', assn) - vname, value = assn.split('=', 1) - js_vars[vname] = parse_js_value(value) - - video_url = js_vars['mediastring'] - - title = self._search_regex( - r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None) - # video_title from flashvars contains whitespace instead of non-ASCII (see # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. - title = title or self._html_search_meta( + title = self._html_search_meta( 'twitter:title', webpage, default=None) or self._search_regex( (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)', r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), webpage, 'title', group='title') + video_urls = [] + video_urls_set = set() + flashvars = self._parse_json( self._search_regex( r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), @@ -183,8 +147,78 @@ class PornHubIE(InfoExtractor): if flashvars: thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) + media_definitions = flashvars.get('mediaDefinitions') + if isinstance(media_definitions, list): + for definition in media_definitions: + if not isinstance(definition, dict): + continue + video_url = definition.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue + if video_url in video_urls_set: + continue + video_urls_set.add(video_url) + video_urls.append( + (video_url, int_or_none(definition.get('quality')))) else: - title, thumbnail, duration = [None] * 3 + thumbnail, duration = [None] * 2 + + if not video_urls: + tv_webpage = dl_webpage('tv') + + assignments = self._search_regex( + r'(var.+?mediastring.+?)</script>', tv_webpage, + 'encoded url').split(';') + + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return functools.reduce( + operator.concat, map(parse_js_value, inps)) + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + return remove_quotes(inp) + + for assn in assignments: + assn = assn.strip() + if not assn: + continue + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) + js_vars[vname] = parse_js_value(value) + + video_url = js_vars['mediastring'] + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + for mobj in re.finditer( + r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage): + video_url = mobj.group('url') + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + formats = [] + for video_url, height in video_urls: + tbr = None + mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url) + if mobj: + if not height: + height = int(mobj.group('height')) + tbr = int(mobj.group('tbr')) + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + self._sort_formats(formats) video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', @@ -210,7 +244,6 @@ class PornHubIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, 'uploader': video_uploader, 'title': title, 'thumbnail': thumbnail, @@ -219,7 +252,7 @@ class PornHubIE(InfoExtractor): 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, - # 'formats': formats, + 'formats': formats, 'age_limit': 18, 'tags': tags, 'categories': categories, From 905eef2b06f1e890b1dfd228aa4fa1fa2308d687 Mon Sep 17 00:00:00 2001 From: Jakub Wilk <jwilk@jwilk.net> Date: Wed, 18 Jul 2018 18:47:26 +0200 Subject: [PATCH 321/488] [imgur] Allow digits in filename extension --- youtube_dl/extractor/imgur.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 2901960a5..ecc958a17 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -12,7 +12,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z0-9]+)?$' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -43,6 +43,9 @@ class ImgurIE(InfoExtractor): }, { 'url': 'http://imgur.com/r/aww/VQcQPhM', 'only_matching': True, + }, { + 'url': 'https://i.imgur.com/crGpqCV.mp4', + 'only_matching': True, }] def _real_extract(self, url): From bd21ead2a20ff16ec8cb10da72526103471069d6 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 18 Jul 2018 18:29:18 +0100 Subject: [PATCH 322/488] [extractor/common] add support for DASH and MSS formats extraction in SMIL manifests --- youtube_dl/extractor/common.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5d4db54d5..b8bbaf81a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1859,9 +1859,7 @@ class InfoExtractor(object): 'height': height, }) formats.extend(m3u8_formats) - continue - - if src_ext == 'f4m': + elif src_ext == 'f4m': f4m_url = src_url if not f4m_params: f4m_params = { @@ -1871,9 +1869,13 @@ class InfoExtractor(object): f4m_url += '&' if '?' in f4m_url else '?' f4m_url += compat_urllib_parse_urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) - continue - - if src_url.startswith('http') and self._is_valid_url(src, video_id): + elif src_ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src_url, video_id, mpd_id='dash', fatal=False)) + elif re.search(r'\.ism/[Mm]anifest', src_url): + formats.extend(self._extract_ism_formats( + src_url, video_id, ism_id='mss', fatal=False)) + elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url, @@ -1884,7 +1886,6 @@ class InfoExtractor(object): 'width': width, 'height': height, }) - continue return formats From 371dcc1dd4b29001910005c1d3e416db204cc262 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 18 Jul 2018 18:31:40 +0100 Subject: [PATCH 323/488] [theplatform] add support for theplatform Top-level domain customization(#16977) --- youtube_dl/extractor/theplatform.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index b1a985ff6..e7dc6071c 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -32,13 +32,14 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): + _TP_TLD = 'com' def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml( smil_url, video_id, note=note, query={'format': 'SMIL'}, headers=self.geo_verification_headers()) error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') if error_element is not None and error_element.attrib['src'].startswith( - 'http://link.theplatform.com/s/errorFiles/Unavailable.'): + 'http://link.theplatform.%s/s/errorFiles/Unavailable.' % self._TP_TLD): raise ExtractorError(error_element.attrib['abstract'], expected=True) smil_formats = self._parse_smil_formats( @@ -66,7 +67,7 @@ class ThePlatformBaseIE(OnceIE): return formats, subtitles def _download_theplatform_metadata(self, path, video_id): - info_url = 'http://link.theplatform.com/s/%s?format=preview' % path + info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path) return self._download_json(info_url, video_id) def _parse_theplatform_metadata(self, info): From 38f1eb0ac3be5d3b61e7722db0024e61cf98eb69 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 18 Jul 2018 18:33:33 +0100 Subject: [PATCH 324/488] [mediaset] fix extraction(closes #16977) --- youtube_dl/extractor/mediaset.py | 155 +++++++++++++++---------------- 1 file changed, 74 insertions(+), 81 deletions(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 9f2b60dcc..57f97409d 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -3,75 +3,75 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor -from ..compat import compat_str +from .theplatform import ThePlatformBaseIE from ..utils import ( - determine_ext, - parse_duration, - try_get, - unified_strdate, + ExtractorError, + int_or_none, + update_url_query, ) -class MediasetIE(InfoExtractor): +class MediasetIE(ThePlatformBaseIE): + _TP_TLD = 'eu' _VALID_URL = r'''(?x) (?: mediaset:| https?:// - (?:www\.)?video\.mediaset\.it/ + (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ (?: (?:video|on-demand)/(?:[^/]+/)+[^/]+_| - player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid= + player/index\.html\?.*?\bprogramGuid= ) - )(?P<id>[0-9]+) + )(?P<id>[0-9A-Z]{16}) ''' _TESTS = [{ # full episode - 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html', + 'url': 'https://www.mediasetplay.mediaset.it/video/hellogoodbye/quarta-puntata_FAFU000000661824', 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d', 'info_dict': { - 'id': '661824', + 'id': 'FAFU000000661824', 'ext': 'mp4', 'title': 'Quarta puntata', - 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1414, - 'creator': 'mediaset', + 'duration': 1414.26, 'upload_date': '20161107', 'series': 'Hello Goodbye', - 'categories': ['reality'], + 'timestamp': 1478532900, + 'uploader': 'Rete 4', + 'uploader_id': 'R4', }, - 'expected_warnings': ['is not a supported codec'], }, { - 'url': 'http://www.video.mediaset.it/video/matrix/full_chiambretti/puntata-del-25-maggio_846685.html', - 'md5': '1276f966ac423d16ba255ce867de073e', + 'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501', + 'md5': '288532f0ad18307705b01e581304cd7b', 'info_dict': { - 'id': '846685', + 'id': 'F309013801000501', 'ext': 'mp4', 'title': 'Puntata del 25 maggio', - 'description': 'md5:ee2e456e3eb1dba5e814596655bb5296', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 6565, - 'creator': 'mediaset', - 'upload_date': '20180525', + 'duration': 6565.007, + 'upload_date': '20180526', 'series': 'Matrix', - 'categories': ['infotainment'], + 'timestamp': 1527326245, + 'uploader': 'Canale 5', + 'uploader_id': 'C5', }, 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { # clip - 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', + 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680', 'only_matching': True, }, { # iframe simple - 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true', + 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924', 'only_matching': True, }, { # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) - 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true', + 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104', 'only_matching': True, }, { - 'url': 'mediaset:661824', + 'url': 'mediaset:FAFU000000665924', 'only_matching': True, }] @@ -84,61 +84,54 @@ class MediasetIE(InfoExtractor): webpage)] def _real_extract(self, url): - video_id = self._match_id(url) - - video = self._download_json( - 'https://www.video.mediaset.it/html/metainfo.sjson', - video_id, 'Downloading media info', query={ - 'id': video_id - })['video'] - - title = video['title'] - media_id = video.get('guid') or video_id - - video_list = self._download_json( - 'http://cdnsel01.mediaset.net/GetCdn2018.aspx', - video_id, 'Downloading video CDN JSON', query={ - 'streamid': media_id, - 'format': 'json', - })['videoList'] + guid = self._match_id(url) + tp_path = 'PR1GhC/media/guid/2702976343/' + guid + info = self._extract_theplatform_metadata(tp_path, guid) formats = [] - for format_url in video_list: - ext = determine_ext(format_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - elif ext == 'ism' or '.ism' in format_url: - formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': determine_ext(format_url), - }) + subtitles = {} + first_e = None + for asset_type in ('SD', 'HD'): + for f in ('MPEG4', 'MPEG-DASH', 'M3U', 'ISM'): + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), { + 'mbr': 'true', + 'formats': f, + 'assetTypes': asset_type, + }), guid, 'Downloading %s %s SMIL data' % (f, asset_type)) + except ExtractorError as e: + if not first_e: + first_e = e + break + for tp_f in tp_formats: + tp_f['quality'] = 1 if asset_type == 'HD' else 0 + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if first_e and not formats: + raise first_e self._sort_formats(formats) - creator = try_get( - video, lambda x: x['brand-info']['publisher'], compat_str) - category = try_get( - video, lambda x: x['brand-info']['category'], compat_str) - categories = [category] if category else None + fields = [] + for templ, repls in (('tvSeason%sNumber', ('', 'Episode')), ('mediasetprogram$%s', ('brandTitle', 'numberOfViews', 'publishInfo'))): + fields.extend(templ % repl for repl in repls) + feed_data = self._download_json( + 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs/guid/-/' + guid, + guid, fatal=False, query={'fields': ','.join(fields)}) + if feed_data: + publish_info = feed_data.get('mediasetprogram$publishInfo') or {} + info.update({ + 'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')), + 'season_number': int_or_none(feed_data.get('tvSeasonNumber')), + 'series': feed_data.get('mediasetprogram$brandTitle'), + 'uploader': publish_info.get('description'), + 'uploader_id': publish_info.get('channel'), + 'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')), + }) - return { - 'id': video_id, - 'title': title, - 'description': video.get('short-description'), - 'thumbnail': video.get('thumbnail'), - 'duration': parse_duration(video.get('duration')), - 'creator': creator, - 'upload_date': unified_strdate(video.get('production-date')), - 'webpage_url': video.get('url'), - 'series': video.get('brand-value'), - 'season': video.get('season'), - 'categories': categories, + info.update({ + 'id': guid, 'formats': formats, - } + 'subtitles': subtitles, + }) + return info From c63f5fb8633dda1b0a673c90d537e93497a8d62d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Jul 2018 01:59:00 +0700 Subject: [PATCH 325/488] [slutload] Fix and improve extraction (closes #17001) --- youtube_dl/extractor/slutload.py | 57 +++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py index 6fc2ff60d..661f9e59d 100644 --- a/youtube_dl/extractor/slutload.py +++ b/youtube_dl/extractor/slutload.py @@ -1,12 +1,10 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class SlutloadIE(InfoExtractor): - _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/', 'md5': '868309628ba00fd488cf516a113fd717', @@ -16,33 +14,52 @@ class SlutloadIE(InfoExtractor): 'title': 'virginie baisee en cam', 'age_limit': 18, 'thumbnail': r're:https?://.*?\.jpg' - } + }, }, { # mobile site 'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/', 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/', + 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - desktop_url = re.sub(r'^(https?://)mobile\.', r'\1', url) - webpage = self._download_webpage(desktop_url, video_id) + embed_page = self._download_webpage( + 'http://www.slutload.com/embed_player/%s' % video_id, video_id, + 'Downloading embed page', fatal=False) - video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>', - webpage, 'title').strip() + if embed_page: + def extract(what): + return self._html_search_regex( + r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what, + embed_page, 'video %s' % what, default=None, group='url') - video_url = self._html_search_regex( - r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"', - webpage, 'video URL') - thumbnail = self._html_search_regex( - r'(?s)<div id="vidPlayer"\s+.*?previewer-file="([^"]+)"', - webpage, 'thumbnail', fatal=False) + video_url = extract('url') + if video_url: + title = self._html_search_regex( + r'<title>([^<]+)', embed_page, 'title', default=video_id) + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': extract('preview'), + 'age_limit': 18 + } - return { + webpage = self._download_webpage( + 'http://www.slutload.com/video/_/%s/' % video_id, video_id) + title = self._html_search_regex( + r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip() + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info.update({ 'id': video_id, - 'url': video_url, - 'title': video_title, - 'thumbnail': thumbnail, - 'age_limit': 18 - } + 'title': title, + 'age_limit': 18, + }) + return info From 8da17f96803faa35ab19352bdcd2777011d8812a Mon Sep 17 00:00:00 2001 From: bato3 <bato3@bandyci.org> Date: Wed, 18 Jul 2018 21:04:05 +0200 Subject: [PATCH 326/488] [dailymotion] Improve description extraction (closes #16984) --- youtube_dl/extractor/dailymotion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 9a74906cb..8f5f57b98 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -144,7 +144,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): age_limit = self._rta_search(webpage) - description = self._og_search_description(webpage) or self._html_search_meta( + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( 'description', webpage, 'description') view_count_str = self._search_regex( From 11330f5121732f80e3d6ba1c34102955427eb04b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Jul 2018 02:25:19 +0700 Subject: [PATCH 327/488] [facebook] Extract view count and update tests (closes #16942) --- youtube_dl/extractor/facebook.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 8a9ed96c2..f78479b92 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -20,6 +20,7 @@ from ..utils import ( int_or_none, js_to_json, limit_length, + parse_count, sanitized_Request, try_get, urlencode_postdata, @@ -75,7 +76,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Asif Nawab Butt posted a video to his Timeline.', + 'title': 're:^Asif Nawab Butt posted a video', 'uploader': 'Asif Nawab Butt', 'upload_date': '20140506', 'timestamp': 1399398998, @@ -133,7 +134,7 @@ class FacebookIE(InfoExtractor): }, { # have 1080P, but only up to 720p in swf params 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': '0d9813160b146b3bc8744e006027fcc6', + 'md5': '9571fae53d4165bbbadb17a94651dcdc', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', @@ -142,6 +143,7 @@ class FacebookIE(InfoExtractor): 'upload_date': '20161030', 'uploader': 'CNN', 'thumbnail': r're:^https?://.*', + 'view_count': int, }, }, { # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall @@ -149,7 +151,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1417995061575415', 'ext': 'mp4', - 'title': 'md5:a7b86ca673f51800cd54687b7f4012fe', + 'title': 'md5:1db063d6a8c13faa8da727817339c857', 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', @@ -176,7 +178,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1396382447100162', 'ext': 'mp4', - 'title': 'md5:e2d2700afdf84e121f5d0f999bad13a3', + 'title': 'md5:19a428bbde91364e3de815383b54a235', 'timestamp': 1486035494, 'upload_date': '20170202', 'uploader': 'Elisabeth Ahtn', @@ -426,6 +428,10 @@ class FacebookIE(InfoExtractor): 'timestamp', default=None)) thumbnail = self._og_search_thumbnail(webpage) + view_count = parse_count(self._search_regex( + r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', + default=None)) + info_dict = { 'id': video_id, 'title': video_title, @@ -433,6 +439,7 @@ class FacebookIE(InfoExtractor): 'uploader': uploader, 'timestamp': timestamp, 'thumbnail': thumbnail, + 'view_count': view_count, } return webpage, info_dict From 6fc09f0155bae1fd0d3edf31111b37012875b6f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Jul 2018 23:14:20 +0700 Subject: [PATCH 328/488] [vimeo] Add another config regex (closes #17013) --- youtube_dl/extractor/vimeo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 3baa2d075..e49b233f2 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -539,9 +539,10 @@ class VimeoIE(VimeoBaseInfoExtractor): # We try to find out to which variable is assigned the config dic m_variable_name = re.search(r'(\w)\.video\.id', webpage) if m_variable_name is not None: - config_re = r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1)) + config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))] else: config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] + config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') config = self._search_regex(config_re, webpage, 'info section', flags=re.DOTALL) config = json.loads(config) From c258570eddeafdef23221326e1961c81934f5297 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 20 Jul 2018 00:01:43 +0700 Subject: [PATCH 329/488] [viu] Pass Referer and Origin headers (closes #16992) --- youtube_dl/extractor/viu.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py index 5cf93591c..e268f9409 100644 --- a/youtube_dl/extractor/viu.py +++ b/youtube_dl/extractor/viu.py @@ -214,6 +214,9 @@ class ViuOTTIE(InfoExtractor): 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code, video_id, 'Downloading stream info', query={ 'ccs_product_id': video_data['ccs_product_id'], + }, headers={ + 'Referer': url, + 'Origin': re.search(r'https?://[^/]+', url).group(0), })['data']['stream'] stream_sizes = stream_data.get('size', {}) From ecb6b6ae2df15cb777fbffdfb8058affa8918f54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 20 Jul 2018 00:46:50 +0700 Subject: [PATCH 330/488] [viu] Pass area id --- youtube_dl/extractor/viu.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py index e268f9409..3bd37525b 100644 --- a/youtube_dl/extractor/viu.py +++ b/youtube_dl/extractor/viu.py @@ -195,16 +195,29 @@ class ViuOTTIE(InfoExtractor): 'skip': 'Geo-restricted to Hong Kong', }] + _AREA_ID = { + 'HK': 1, + 'SG': 2, + 'TH': 4, + 'PH': 5, + } + def _real_extract(self, url): country_code, video_id = re.match(self._VALID_URL, url).groups() + query = { + 'r': 'vod/ajax-detail', + 'platform_flag_label': 'web', + 'product_id': video_id, + } + + area_id = self._AREA_ID.get(country_code.upper()) + if area_id: + query['area_id'] = area_id + product_data = self._download_json( 'http://www.viu.com/ott/%s/index.php' % country_code, video_id, - 'Downloading video info', query={ - 'r': 'vod/ajax-detail', - 'platform_flag_label': 'web', - 'product_id': video_id, - })['data'] + 'Downloading video info', query=query)['data'] video_data = product_data.get('current_product') if not video_data: From 25586c601c46768271070c61af76e7fa1d196890 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 20 Jul 2018 00:48:50 +0700 Subject: [PATCH 331/488] [theplatform] PEP 8 [ci skip] --- youtube_dl/extractor/theplatform.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index e7dc6071c..411b1f874 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -33,6 +33,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): _TP_TLD = 'com' + def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml( smil_url, video_id, note=note, query={'format': 'SMIL'}, From fd62b36680ff7d7bea789ac0031a33fc2d9270ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 20 Jul 2018 02:39:20 +0700 Subject: [PATCH 332/488] [vrtnu] Relax title extraction and extract JSON-LD (closes #17018) --- youtube_dl/extractor/canvas.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8ac62c1a6..174fd9e2b 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -11,6 +11,7 @@ from ..utils import ( strip_or_none, float_or_none, int_or_none, + merge_dicts, parse_iso8601, ) @@ -248,9 +249,13 @@ class VrtNUIE(GigyaBaseIE): webpage, urlh = self._download_webpage_handle(url, display_id) - title = self._html_search_regex( + info = self._search_json_ld(webpage, display_id, default={}) + + # title is optional here since it may be extracted by extractor + # that is delegated from here + title = strip_or_none(self._html_search_regex( r'(?ms)<h1 class="content__heading">(.+?)</h1>', - webpage, 'title').strip() + webpage, 'title', default=None)) description = self._html_search_regex( r'(?ms)<div class="content__description">(.+?)</div>', @@ -295,7 +300,7 @@ class VrtNUIE(GigyaBaseIE): # the first one video_id = list(video.values())[0].get('videoid') - return { + return merge_dicts(info, { '_type': 'url_transparent', 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, 'ie_key': CanvasIE.ie_key(), @@ -307,4 +312,4 @@ class VrtNUIE(GigyaBaseIE): 'season_number': season_number, 'episode_number': episode_number, 'release_date': release_date, - } + }) From e9c671d5e86e43785382ae9cb20c8e7676c7c9bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Jul 2018 12:30:18 +0700 Subject: [PATCH 333/488] [utils] Allow JSONP with empty func name (closes #17028) --- test/test_utils.py | 4 ++++ youtube_dl/utils.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index e63af0166..de841b1a0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -717,6 +717,10 @@ class TestUtil(unittest.TestCase): d = json.loads(stripped) self.assertEqual(d, {'status': 'success'}) + stripped = strip_jsonp('({"status": "success"});') + d = json.loads(stripped) + self.assertEqual(d, {'status': 'success'}) + def test_uppercase_escape(self): self.assertEqual(uppercase_escape('aä'), 'aä') self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8c45166d7..b8700efcb 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2282,7 +2282,7 @@ def parse_age_limit(s): def strip_jsonp(code): return re.sub( r'''(?sx)^ - (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+) + (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*) (?:\s*&&\s*(?P=func_name))? \s*\(\s*(?P<callback_data>.*)\);? \s*?(?://[^\n]*)*$''', From edb0e17188a1197afae1a0d594e4aab7d27bbcf2 Mon Sep 17 00:00:00 2001 From: Kazuma Takahara <4269kzm@gmail.com> Date: Sat, 21 Jul 2018 19:41:33 +0900 Subject: [PATCH 334/488] [iwara] Fix download URLs (closes #17026) --- youtube_dl/extractor/iwara.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index a7514fc80..250140d91 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -77,7 +77,7 @@ class IwaraIE(InfoExtractor): height = int_or_none(self._search_regex( r'(\d+)p', format_id, 'height', default=None)) formats.append({ - 'url': a_format['uri'], + 'url': self._proto_relative_url(a_format['uri'], 'https:'), 'format_id': format_id, 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', 'height': height, From b96b4be4619b1e090650212380a92fb068f2fd21 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 21 Jul 2018 11:49:55 +0100 Subject: [PATCH 335/488] [bbc] add support for BBC Radio Play pages(closes #17022) --- youtube_dl/extractor/bbc.py | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 293d82b0f..641bf6073 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -778,6 +778,17 @@ class BBCIE(BBCCoUkIE): 'params': { 'skip_download': True, } + }, { + # window.__PRELOADED_STATE__ + 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', + 'info_dict': { + 'id': 'b0b9z4vz', + 'ext': 'mp4', + 'title': 'Prom 6: An American in Paris and Turangalila', + 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8', + 'uploader': 'Radio 3', + 'uploader_id': 'bbc_radio_three', + }, }] @classmethod @@ -1000,6 +1011,36 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + preload_state = self._parse_json(self._search_regex( + r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if preload_state: + current_programme = preload_state.get('programmes', {}).get('current') or {} + programme_id = current_programme.get('id') + if current_programme and programme_id and current_programme.get('type') == 'playable_item': + title = current_programme.get('titles', {}).get('tertiary') or playlist_title + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + synopses = current_programme.get('synopses') or {} + network = current_programme.get('network') or {} + duration = int_or_none( + current_programme.get('duration', {}).get('value')) + thumbnail = None + image_url = current_programme.get('image_url') + if image_url: + thumbnail = image_url.replace('{recipe}', '1920x1920') + return { + 'id': programme_id, + 'title': title, + 'description': dict_get(synopses, ('long', 'medium', 'short')), + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': network.get('short_title'), + 'uploader_id': network.get('id'), + 'formats': formats, + 'subtitles': subtitles, + } + bbc3_config = self._parse_json( self._search_regex( r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, From af03000ad5a445f03fbacb63ce626f8dcfe785c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Jul 2018 18:01:06 +0700 Subject: [PATCH 336/488] [utils] Introduce url_or_none --- test/test_utils.py | 11 +++++++++++ youtube_dl/utils.py | 7 +++++++ 2 files changed, 18 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index de841b1a0..8da5ccc56 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -78,6 +78,7 @@ from youtube_dl.utils import ( uppercase_escape, lowercase_escape, url_basename, + url_or_none, base_url, urljoin, urlencode_postdata, @@ -507,6 +508,16 @@ class TestUtil(unittest.TestCase): self.assertEqual(urljoin('http://foo.de/', ['foobar']), None) self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt') + def test_url_or_none(self): + self.assertEqual(url_or_none(None), None) + self.assertEqual(url_or_none(''), None) + self.assertEqual(url_or_none('foo'), None) + self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') + self.assertEqual(url_or_none('https://foo.de'), 'https://foo.de') + self.assertEqual(url_or_none('http$://foo.de'), None) + self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') + self.assertEqual(url_or_none('//foo.de'), '//foo.de') + def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) self.assertEqual(parse_age_limit(False), None) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b8700efcb..b84436ed6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1866,6 +1866,13 @@ def strip_or_none(v): return None if v is None else v.strip() +def url_or_none(url): + if not url or not isinstance(url, compat_str): + return None + url = url.strip() + return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None + + def parse_duration(s): if not isinstance(s, compat_basestring): return None From 4ecf300d13a6503ae80b76e01047b41d86ab4d92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Jul 2018 18:02:41 +0700 Subject: [PATCH 337/488] [iwara] Improve extraction --- youtube_dl/extractor/iwara.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index 250140d91..907d5fc8b 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -7,6 +7,7 @@ from ..utils import ( int_or_none, mimetype2ext, remove_end, + url_or_none, ) @@ -73,11 +74,14 @@ class IwaraIE(InfoExtractor): formats = [] for a_format in video_data: + format_uri = url_or_none(a_format.get('uri')) + if not format_uri: + continue format_id = a_format.get('resolution') height = int_or_none(self._search_regex( r'(\d+)p', format_id, 'height', default=None)) formats.append({ - 'url': self._proto_relative_url(a_format['uri'], 'https:'), + 'url': self._proto_relative_url(format_uri, 'https:'), 'format_id': format_id, 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', 'height': height, From 3052a30d4259b182904e5d2430077039461745bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Jul 2018 19:08:28 +0700 Subject: [PATCH 338/488] Improve URL extraction --- youtube_dl/extractor/adultswim.py | 3 ++- youtube_dl/extractor/afreecatv.py | 3 ++- youtube_dl/extractor/amp.py | 15 ++++++++------- youtube_dl/extractor/animeondemand.py | 3 ++- youtube_dl/extractor/aol.py | 3 ++- youtube_dl/extractor/apa.py | 6 +++--- youtube_dl/extractor/aparat.py | 3 ++- youtube_dl/extractor/ard.py | 4 ++-- youtube_dl/extractor/bandcamp.py | 7 ++++--- youtube_dl/extractor/breakcom.py | 10 ++++++---- youtube_dl/extractor/cammodels.py | 6 +++--- youtube_dl/extractor/ccma.py | 6 +++--- youtube_dl/extractor/crackle.py | 14 ++++++-------- youtube_dl/extractor/dctp.py | 7 ++++--- youtube_dl/extractor/discoverygo.py | 7 +++---- youtube_dl/extractor/dramafever.py | 10 +++++----- youtube_dl/extractor/eagleplatform.py | 8 +++----- youtube_dl/extractor/egghead.py | 8 +++++--- youtube_dl/extractor/eporner.py | 5 +++-- youtube_dl/extractor/firsttv.py | 5 +++-- youtube_dl/extractor/francetv.py | 8 ++++---- youtube_dl/extractor/frontendmasters.py | 3 ++- youtube_dl/extractor/generic.py | 5 +++-- youtube_dl/extractor/hidive.py | 10 +++++----- youtube_dl/extractor/imdb.py | 6 +++--- youtube_dl/extractor/instagram.py | 3 ++- youtube_dl/extractor/itv.py | 5 +++-- youtube_dl/extractor/keezmovies.py | 9 ++++----- youtube_dl/extractor/konserthusetplay.py | 5 +++-- youtube_dl/extractor/mediasite.py | 5 +++-- youtube_dl/extractor/peertube.py | 5 +++-- youtube_dl/extractor/redtube.py | 6 +++--- youtube_dl/extractor/rentv.py | 5 +++-- youtube_dl/extractor/rutube.py | 5 +++-- youtube_dl/extractor/turner.py | 5 +++-- youtube_dl/extractor/tvnet.py | 7 +++---- youtube_dl/extractor/tvplay.py | 4 +++- youtube_dl/extractor/twitch.py | 5 +++-- youtube_dl/extractor/udemy.py | 13 +++++++------ youtube_dl/extractor/vidme.py | 10 ++++------ youtube_dl/extractor/vk.py | 4 +++- youtube_dl/extractor/xhamster.py | 7 +++++-- youtube_dl/extractor/yapfiles.py | 8 ++++---- youtube_dl/extractor/youjizz.py | 6 +++--- youtube_dl/extractor/youporn.py | 6 +++--- youtube_dl/extractor/zattoo.py | 5 +++-- youtube_dl/extractor/zdf.py | 12 +++++++----- 47 files changed, 166 insertions(+), 139 deletions(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index acc4ce38d..88c96a950 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -7,6 +7,7 @@ from .turner import TurnerBaseIE from ..utils import ( int_or_none, strip_or_none, + url_or_none, ) @@ -98,7 +99,7 @@ class AdultSwimIE(TurnerBaseIE): if not video_id: entries = [] for episode in video_data.get('archiveEpisodes', []): - episode_url = episode.get('url') + episode_url = url_or_none(episode.get('url')) if not episode_url: continue entries.append(self.url_result( diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 4b3d97136..6275e5209 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + url_or_none, urlencode_postdata, xpath_text, ) @@ -304,7 +305,7 @@ class AfreecaTVIE(InfoExtractor): file_elements = video_element.findall(compat_xpath('./file')) one = len(file_elements) == 1 for file_num, file_element in enumerate(file_elements, start=1): - file_url = file_element.text + file_url = url_or_none(file_element.text) if not file_url: continue key = file_element.get('key', '') diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index fde1a8ff7..7ff098cfa 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -3,11 +3,12 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - int_or_none, - parse_iso8601, - mimetype2ext, determine_ext, ExtractorError, + int_or_none, + mimetype2ext, + parse_iso8601, + url_or_none, ) @@ -35,7 +36,7 @@ class AMPIE(InfoExtractor): media_thumbnail = [media_thumbnail] for thumbnail_data in media_thumbnail: thumbnail = thumbnail_data.get('@attributes', {}) - thumbnail_url = thumbnail.get('url') + thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue thumbnails.append({ @@ -51,7 +52,7 @@ class AMPIE(InfoExtractor): media_subtitle = [media_subtitle] for subtitle_data in media_subtitle: subtitle = subtitle_data.get('@attributes', {}) - subtitle_href = subtitle.get('href') + subtitle_href = url_or_none(subtitle.get('href')) if not subtitle_href: continue subtitles.setdefault(subtitle.get('lang') or 'en', []).append({ @@ -65,7 +66,7 @@ class AMPIE(InfoExtractor): media_content = [media_content] for media_data in media_content: media = media_data.get('@attributes', {}) - media_url = media.get('url') + media_url = url_or_none(media.get('url')) if not media_url: continue ext = mimetype2ext(media.get('type')) or determine_ext(media_url) @@ -79,7 +80,7 @@ class AMPIE(InfoExtractor): else: formats.append({ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), - 'url': media['url'], + 'url': media_url, 'tbr': int_or_none(media.get('bitrate')), 'filesize': int_or_none(media.get('fileSize')), 'ext': ext, diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 1fe5d5e56..00ce684d1 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, extract_attributes, ExtractorError, + url_or_none, urlencode_postdata, urljoin, ) @@ -165,7 +166,7 @@ class AnimeOnDemandIE(InfoExtractor): }, fatal=False) if not playlist: continue - stream_url = playlist.get('streamurl') + stream_url = url_or_none(playlist.get('streamurl')) if stream_url: rtmp = re.search( r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)', diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b50f454ee..cb9279193 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + url_or_none, ) @@ -77,7 +78,7 @@ class AolIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) for rendition in video_data.get('renditions', []): - video_url = rendition.get('url') + video_url = url_or_none(rendition.get('url')) if not video_url: continue ext = rendition.get('format') diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py index a30a935aa..98ccdaa4a 100644 --- a/youtube_dl/extractor/apa.py +++ b/youtube_dl/extractor/apa.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, js_to_json, + url_or_none, ) @@ -68,8 +68,8 @@ class APAIE(InfoExtractor): for source in sources: if not isinstance(source, dict): continue - source_url = source.get('file') - if not source_url or not isinstance(source_url, compat_str): + source_url = url_or_none(source.get('file')) + if not source_url: continue ext = determine_ext(source_url) if ext == 'm3u8': diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index e394cb661..6eb8bbb6e 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, mimetype2ext, + url_or_none, ) @@ -43,7 +44,7 @@ class AparatIE(InfoExtractor): formats = [] for item in file_list[0]: - file_url = item.get('file') + file_url = url_or_none(item.get('file')) if not file_url: continue ext = mimetype2ext(item.get('type')) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 86951d975..23f574d36 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from .generic import GenericIE -from ..compat import compat_str from ..utils import ( determine_ext, ExtractorError, @@ -15,6 +14,7 @@ from ..utils import ( unified_strdate, xpath_text, update_url_query, + url_or_none, ) from ..compat import compat_etree_fromstring @@ -100,7 +100,7 @@ class ARDMediathekIE(InfoExtractor): quality = stream.get('_quality') server = stream.get('_server') for stream_url in stream_urls: - if not isinstance(stream_url, compat_str) or '//' not in stream_url: + if not url_or_none(stream_url): continue ext = determine_ext(stream_url) if quality != 'auto' and ext in ('f4m', 'm3u8'): diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index be41bd5a2..b8514734d 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -19,6 +19,7 @@ from ..utils import ( unescapeHTML, update_url_query, unified_strdate, + url_or_none, ) @@ -131,8 +132,8 @@ class BandcampIE(InfoExtractor): fatal=False) if not stat: continue - retry_url = stat.get('retry_url') - if not isinstance(retry_url, compat_str): + retry_url = url_or_none(stat.get('retry_url')) + if not retry_url: continue formats.append({ 'url': self._proto_relative_url(retry_url, 'http:'), @@ -306,7 +307,7 @@ class BandcampWeeklyIE(InfoExtractor): formats = [] for format_id, format_url in show['audio_stream'].items(): - if not isinstance(format_url, compat_str): + if not url_or_none(format_url): continue for known_ext in KNOWN_EXTENSIONS: if known_ext in format_id: diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 70d16767f..68c7cf2bb 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -4,8 +4,10 @@ import re from .common import InfoExtractor from .youtube import YoutubeIE -from ..compat import compat_str -from ..utils import int_or_none +from ..utils import ( + int_or_none, + url_or_none, +) class BreakIE(InfoExtractor): @@ -55,8 +57,8 @@ class BreakIE(InfoExtractor): formats = [] for video in content: - video_url = video.get('url') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(video.get('url')) + if not video_url: continue bitrate = int_or_none(self._search_regex( r'(\d+)_kbps', video_url, 'tbr', default=None)) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index ee0165dba..79350817f 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -2,10 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, + url_or_none, ) @@ -56,8 +56,8 @@ class CamModelsIE(InfoExtractor): for media in encodings: if not isinstance(media, dict): continue - media_url = media.get('location') - if not media_url or not isinstance(media_url, compat_str): + media_url = url_or_none(media.get('location')) + if not media_url: continue format_id_list = [format_id] diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index 07f5206c1..544647f92 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -4,13 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, int_or_none, parse_duration, parse_iso8601, parse_resolution, + url_or_none, ) @@ -53,8 +53,8 @@ class CCMAIE(InfoExtractor): media_url = media['media']['url'] if isinstance(media_url, list): for format_ in media_url: - format_url = format_.get('file') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(format_.get('file')) + if not format_url: continue label = format_.get('label') f = parse_resolution(label) diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index f4a616455..8dd9d6687 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -4,16 +4,14 @@ from __future__ import unicode_literals, division import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) +from ..compat import compat_HTTPError from ..utils import ( determine_ext, float_or_none, int_or_none, parse_age_limit, parse_duration, + url_or_none, ExtractorError ) @@ -86,8 +84,8 @@ class CrackleIE(InfoExtractor): for e in media['MediaURLs']: if e.get('UseDRM') is True: continue - format_url = e.get('Path') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(e.get('Path')) + if not format_url: continue ext = determine_ext(format_url) if ext == 'm3u8': @@ -124,8 +122,8 @@ class CrackleIE(InfoExtractor): for cc_file in cc_files: if not isinstance(cc_file, dict): continue - cc_url = cc_file.get('Path') - if not cc_url or not isinstance(cc_url, compat_str): + cc_url = url_or_none(cc_file.get('Path')) + if not cc_url: continue lang = cc_file.get('Locale') or 'en' subtitles.setdefault(lang, []).append({'url': cc_url}) diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index dc0c41b8a..769a219df 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -7,6 +7,7 @@ from ..utils import ( float_or_none, int_or_none, unified_timestamp, + url_or_none, ) @@ -69,7 +70,7 @@ class DctpTvIE(InfoExtractor): endpoint = next( server['endpoint'] for server in servers - if isinstance(server.get('endpoint'), compat_str) and + if url_or_none(server.get('endpoint')) and 'cloudfront' in server['endpoint']) else: endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/' @@ -92,8 +93,8 @@ class DctpTvIE(InfoExtractor): for image in images: if not isinstance(image, dict): continue - image_url = image.get('url') - if not image_url or not isinstance(image_url, compat_str): + image_url = url_or_none(image.get('url')) + if not image_url: continue thumbnails.append({ 'url': image_url, diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py index 3368c4c07..9e7b14a7d 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, extract_attributes, @@ -12,6 +11,7 @@ from ..utils import ( parse_age_limit, remove_end, unescapeHTML, + url_or_none, ) @@ -69,9 +69,8 @@ class DiscoveryGoBaseIE(InfoExtractor): captions = stream.get('captions') if isinstance(captions, list): for caption in captions: - subtitle_url = caption.get('fileUrl') - if (not subtitle_url or not isinstance(subtitle_url, compat_str) or - not subtitle_url.startswith('http')): + subtitle_url = url_or_none(caption.get('fileUrl')) + if not subtitle_url or not subtitle_url.startswith('http'): continue lang = caption.get('fileLang', 'en') ext = determine_ext(subtitle_url) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index ab32ba4ff..db1de699f 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -7,7 +7,6 @@ import json from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_str, compat_urlparse, ) from ..utils import ( @@ -17,6 +16,7 @@ from ..utils import ( parse_age_limit, parse_duration, unified_timestamp, + url_or_none, ) @@ -139,8 +139,8 @@ class DramaFeverIE(DramaFeverBaseIE): for sub in subs: if not isinstance(sub, dict): continue - sub_url = sub.get('url') - if not sub_url or not isinstance(sub_url, compat_str): + sub_url = url_or_none(sub.get('url')) + if not sub_url: continue subtitles.setdefault( sub.get('code') or sub.get('language') or 'en', []).append({ @@ -163,8 +163,8 @@ class DramaFeverIE(DramaFeverBaseIE): for format_id, format_dict in download_assets.items(): if not isinstance(format_dict, dict): continue - format_url = format_dict.get('url') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(format_dict.get('url')) + if not format_url: continue formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 42789278e..36fef07b7 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -4,14 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, unsmuggle_url, + url_or_none, ) @@ -177,7 +175,7 @@ class EaglePlatformIE(InfoExtractor): video_id, 'Downloading mp4 JSON', fatal=False) if mp4_data: for format_id, format_url in mp4_data.get('data', {}).items(): - if not isinstance(format_url, compat_str): + if not url_or_none(format_url): continue height = int_or_none(format_id) if height is not None and m3u8_formats_dict.get(height): diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index edabaafe6..df11dc206 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -8,6 +8,7 @@ from ..utils import ( int_or_none, try_get, unified_timestamp, + url_or_none, ) @@ -34,8 +35,8 @@ class EggheadCourseIE(InfoExtractor): entries = [] for lesson in lessons: - lesson_url = lesson.get('http_url') - if not lesson_url or not isinstance(lesson_url, compat_str): + lesson_url = url_or_none(lesson.get('http_url')) + if not lesson_url: continue lesson_id = lesson.get('id') if lesson_id: @@ -95,7 +96,8 @@ class EggheadLessonIE(InfoExtractor): formats = [] for _, format_url in lesson['media_urls'].items(): - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(format_url) + if not format_url: continue ext = determine_ext(format_url) if ext == 'm3u8': diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 81f2e2ee1..6d03d7095 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -11,6 +11,7 @@ from ..utils import ( int_or_none, parse_duration, str_to_int, + url_or_none, ) @@ -82,8 +83,8 @@ class EpornerIE(InfoExtractor): for format_id, format_dict in formats_dict.items(): if not isinstance(format_dict, dict): continue - src = format_dict.get('src') - if not isinstance(src, compat_str) or not src.startswith('http'): + src = url_or_none(format_dict.get('src')) + if not src or not src.startswith('http'): continue if kind == 'hls': formats.extend(self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 4803a22c8..28617d83c 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, qualities, unified_strdate, + url_or_none, ) @@ -88,8 +89,8 @@ class FirstTVIE(InfoExtractor): formats = [] path = None for f in item.get('mbr', []): - src = f.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(f.get('src')) + if not src: continue tbr = int_or_none(self._search_regex( r'_(\d{3,})\.mp4', src, 'tbr', default=None)) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 6fc6b0da0..2ffe83a78 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -16,6 +16,7 @@ from ..utils import ( int_or_none, parse_duration, try_get, + url_or_none, ) from .dailymotion import DailymotionIE @@ -115,14 +116,13 @@ class FranceTVIE(InfoExtractor): def sign(manifest_url, manifest_id): for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'): - signed_url = self._download_webpage( + signed_url = url_or_none(self._download_webpage( 'https://%s/esi/TA' % host, video_id, 'Downloading signed %s manifest URL' % manifest_id, fatal=False, query={ 'url': manifest_url, - }) - if (signed_url and isinstance(signed_url, compat_str) and - re.search(r'^(?:https?:)?//', signed_url)): + })) + if signed_url: return signed_url return manifest_url diff --git a/youtube_dl/extractor/frontendmasters.py b/youtube_dl/extractor/frontendmasters.py index 770db46d0..cb57ba007 100644 --- a/youtube_dl/extractor/frontendmasters.py +++ b/youtube_dl/extractor/frontendmasters.py @@ -11,6 +11,7 @@ from ..compat import ( from ..utils import ( ExtractorError, parse_duration, + url_or_none, urlencode_postdata, ) @@ -80,7 +81,7 @@ class FrontendMastersPageBaseIE(FrontendMastersBaseIE): chapters = [] lesson_elements = course.get('lessonElements') if isinstance(lesson_elements, list): - chapters = [e for e in lesson_elements if isinstance(e, compat_str)] + chapters = [url_or_none(e) for e in lesson_elements if url_or_none(e)] return chapters @staticmethod diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index aa04905ed..e5a8ffbe8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -32,6 +32,7 @@ from ..utils import ( unified_strdate, unsmuggle_url, UnsupportedError, + url_or_none, xpath_text, ) from .commonprotocols import RtmpIE @@ -3130,8 +3131,8 @@ class GenericIE(InfoExtractor): sources = [sources] formats = [] for source in sources: - src = source.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(source.get('src')) + if not src: continue src = compat_urlparse.urljoin(url, src) src_type = source.get('type') diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py index 39fabe8a5..f26f80265 100644 --- a/youtube_dl/extractor/hidive.py +++ b/youtube_dl/extractor/hidive.py @@ -8,6 +8,7 @@ from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, + url_or_none, urlencode_postdata, ) @@ -80,8 +81,8 @@ class HiDiveIE(InfoExtractor): bitrates = rendition.get('bitrates') if not isinstance(bitrates, dict): continue - m3u8_url = bitrates.get('hls') - if not isinstance(m3u8_url, compat_str): + m3u8_url = url_or_none(bitrates.get('hls')) + if not m3u8_url: continue formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', @@ -93,9 +94,8 @@ class HiDiveIE(InfoExtractor): if not isinstance(cc_file, list) or len(cc_file) < 3: continue cc_lang = cc_file[0] - cc_url = cc_file[2] - if not isinstance(cc_lang, compat_str) or not isinstance( - cc_url, compat_str): + cc_url = url_or_none(cc_file[2]) + if not isinstance(cc_lang, compat_str) or not cc_url: continue subtitles.setdefault(cc_lang, []).append({ 'url': cc_url, diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 4bafa54a2..fba01ef49 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -3,12 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, mimetype2ext, parse_duration, qualities, + url_or_none, ) @@ -61,8 +61,8 @@ class ImdbIE(InfoExtractor): for encoding in video_metadata.get('encodings', []): if not encoding or not isinstance(encoding, dict): continue - video_url = encoding.get('videoUrl') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(encoding.get('videoUrl')) + if not video_url: continue ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType'))) if ext == 'm3u8': diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 0c13f54ee..7e0e838f0 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -17,6 +17,7 @@ from ..utils import ( lowercase_escape, std_headers, try_get, + url_or_none, ) @@ -170,7 +171,7 @@ class InstagramIE(InfoExtractor): node = try_get(edge, lambda x: x['node'], dict) if not node: continue - node_video_url = try_get(node, lambda x: x['video_url'], compat_str) + node_video_url = url_or_none(node.get('video_url')) if not node_video_url: continue entries.append({ diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index d05a7b68d..de65b6bb4 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -20,6 +20,7 @@ from ..utils import ( merge_dicts, parse_duration, smuggle_url, + url_or_none, xpath_with_ns, xpath_element, xpath_text, @@ -250,8 +251,8 @@ class ITVIE(InfoExtractor): for sub in subs: if not isinstance(sub, dict): continue - href = sub.get('Href') - if isinstance(href, compat_str): + href = url_or_none(sub.get('Href')) + if href: extract_subtitle(href) if not info.get('duration'): info['duration'] = parse_duration(video_data.get('Duration')) diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index d4e6f7ac1..c3eb74c17 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -4,16 +4,14 @@ import re from .common import InfoExtractor from ..aes import aes_decrypt_text -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) +from ..compat import compat_urllib_parse_unquote from ..utils import ( determine_ext, ExtractorError, int_or_none, str_to_int, strip_or_none, + url_or_none, ) @@ -55,7 +53,8 @@ class KeezMoviesIE(InfoExtractor): encrypted = False def extract_format(format_url, height=None): - if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//')): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//')): return if format_url in format_urls: return diff --git a/youtube_dl/extractor/konserthusetplay.py b/youtube_dl/extractor/konserthusetplay.py index c11cbcf47..dd42bb2f2 100644 --- a/youtube_dl/extractor/konserthusetplay.py +++ b/youtube_dl/extractor/konserthusetplay.py @@ -2,11 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, int_or_none, + url_or_none, ) @@ -109,7 +109,8 @@ class KonserthusetPlayIE(InfoExtractor): captions = source.get('captionsAvailableLanguages') if isinstance(captions, dict): for lang, subtitle_url in captions.items(): - if lang != 'none' and isinstance(subtitle_url, compat_str): + subtitle_url = url_or_none(subtitle_url) + if lang != 'none' and subtitle_url: subtitles.setdefault(lang, []).append({'url': subtitle_url}) return { diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index 0e2645c55..84876b883 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -15,6 +15,7 @@ from ..utils import ( mimetype2ext, unescapeHTML, unsmuggle_url, + url_or_none, urljoin, ) @@ -156,8 +157,8 @@ class MediasiteIE(InfoExtractor): stream_formats = [] for unum, VideoUrl in enumerate(video_urls): - video_url = VideoUrl.get('Location') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(VideoUrl.get('Location')) + if not video_url: continue # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index d9849a2ba..e03c3d1d3 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -10,6 +10,7 @@ from ..utils import ( parse_resolution, try_get, unified_timestamp, + url_or_none, urljoin, ) @@ -200,8 +201,8 @@ class PeerTubeIE(InfoExtractor): for file_ in video['files']: if not isinstance(file_, dict): continue - file_url = file_.get('fileUrl') - if not file_url or not isinstance(file_url, compat_str): + file_url = url_or_none(file_.get('fileUrl')) + if not file_url: continue file_size = int_or_none(file_.get('size')) format_id = try_get( diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 879bcf81d..10311a81a 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -3,12 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, str_to_int, unified_strdate, + url_or_none, ) @@ -71,8 +71,8 @@ class RedTubeIE(InfoExtractor): video_id, fatal=False) if medias and isinstance(medias, list): for media in medias: - format_url = media.get('videoUrl') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(media.get('videoUrl')) + if not format_url: continue format_id = media.get('quality') formats.append({ diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py index 8bcf87126..7c8909d95 100644 --- a/youtube_dl/extractor/rentv.py +++ b/youtube_dl/extractor/rentv.py @@ -6,6 +6,7 @@ from ..compat import compat_str from ..utils import ( determine_ext, int_or_none, + url_or_none, ) @@ -37,8 +38,8 @@ class RENTVIE(InfoExtractor): title = config['title'] formats = [] for video in config['src']: - src = video.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(video.get('src')) + if not src: continue ext = determine_ext(src) if ext == 'm3u8': diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 89d89b65a..261bcbb83 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -16,6 +16,7 @@ from ..utils import ( int_or_none, try_get, unified_timestamp, + url_or_none, ) @@ -176,8 +177,8 @@ class RutubePlaylistBaseIE(RutubeBaseIE): break for result in results: - video_url = result.get('video_url') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(result.get('video_url')) + if not video_url: continue entry = self._extract_video(result, require_title=False) entry.update({ diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 2b7b0d6e1..4a6cbfbb8 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -15,6 +15,7 @@ from ..utils import ( update_url_query, ExtractorError, strip_or_none, + url_or_none, ) @@ -154,8 +155,8 @@ class TurnerBaseIE(AdobePassIE): subtitles = {} for source in video_data.findall('closedCaptions/source'): for track in source.findall('track'): - track_url = track.get('url') - if not isinstance(track_url, compat_str) or track_url.endswith('/big'): + track_url = url_or_none(track.get('url')) + if not track_url or track_url.endswith('/big'): continue lang = track.get('lang') or track.get('label') or 'en' subtitles.setdefault(lang, []).append({ diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py index 2b2630b91..4222ff9ee 100644 --- a/youtube_dl/extractor/tvnet.py +++ b/youtube_dl/extractor/tvnet.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, unescapeHTML, + url_or_none, ) @@ -106,9 +106,8 @@ class TVNetIE(InfoExtractor): for stream in self._download_json(data_file, video_id): if not isinstance(stream, dict): continue - stream_url = stream.get('url') - if (stream_url in stream_urls or not stream_url or - not isinstance(stream_url, compat_str)): + stream_url = url_or_none(stream.get('url')) + if stream_url in stream_urls or not stream_url: continue stream_urls.add(stream_url) formats.extend(self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index e09b5f804..d3adab457 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -19,6 +19,7 @@ from ..utils import ( try_get, unsmuggle_url, update_url_query, + url_or_none, ) @@ -255,7 +256,8 @@ class TVPlayIE(InfoExtractor): quality = qualities(['hls', 'medium', 'high']) formats = [] for format_id, video_url in streams.get('streams', {}).items(): - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(video_url) + if not video_url: continue ext = determine_ext(video_url) if ext == 'f4m': diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index e01f11331..89ee44224 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -27,6 +27,7 @@ from ..utils import ( unified_timestamp, update_url_query, urlencode_postdata, + url_or_none, urljoin, ) @@ -663,8 +664,8 @@ class TwitchClipsIE(TwitchBaseIE): for option in status['quality_options']: if not isinstance(option, dict): continue - source = option.get('source') - if not source or not isinstance(source, compat_str): + source = url_or_none(option.get('source')) + if not source: continue formats.append({ 'url': source, diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index a7196997e..79c45f80e 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -20,6 +20,7 @@ from ..utils import ( sanitized_Request, try_get, unescapeHTML, + url_or_none, urlencode_postdata, ) @@ -265,8 +266,8 @@ class UdemyIE(InfoExtractor): if not isinstance(source_list, list): return for source in source_list: - video_url = source.get('file') or source.get('src') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(source.get('file') or source.get('src')) + if not video_url: continue if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -293,8 +294,8 @@ class UdemyIE(InfoExtractor): continue if track.get('kind') != 'captions': continue - src = track.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(track.get('src')) + if not src: continue lang = track.get('language') or track.get( 'srclang') or track.get('label') @@ -314,8 +315,8 @@ class UdemyIE(InfoExtractor): for cc in captions: if not isinstance(cc, dict): continue - cc_url = cc.get('url') - if not cc_url or not isinstance(cc_url, compat_str): + cc_url = url_or_none(cc.get('url')) + if not cc_url: continue lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) sub_dict = (automatic_captions if cc.get('source') == 'auto' diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 59adb2377..174e69cd6 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -3,15 +3,13 @@ from __future__ import unicode_literals import itertools from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, float_or_none, parse_iso8601, + url_or_none, ) @@ -166,8 +164,8 @@ class VidmeIE(InfoExtractor): formats = [] for f in video.get('formats', []): - format_url = f.get('uri') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(f.get('uri')) + if not format_url: continue format_type = f.get('type') if format_type == 'dash': diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 29002b35f..48b5987c2 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -20,6 +20,7 @@ from ..utils import ( str_to_int, unescapeHTML, unified_timestamp, + url_or_none, urlencode_postdata, ) from .dailymotion import DailymotionIE @@ -423,7 +424,8 @@ class VKIE(VKBaseIE): formats = [] for format_id, format_url in data.items(): - if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//', 'rtmp')): continue if (format_id.startswith(('url', 'cache')) or format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index d1bc992fd..68a48034e 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -13,6 +13,7 @@ from ..utils import ( parse_duration, try_get, unified_strdate, + url_or_none, ) @@ -137,7 +138,8 @@ class XHamsterIE(InfoExtractor): else: format_url = format_item filesize = None - if not isinstance(format_url, compat_str): + format_url = url_or_none(format_url) + if not format_url: continue formats.append({ 'format_id': '%s-%s' % (format_id, quality), @@ -198,7 +200,8 @@ class XHamsterIE(InfoExtractor): default='{}'), video_id, fatal=False) for format_id, format_url in sources.items(): - if not isinstance(format_url, compat_str): + format_url = url_or_none(format_url) + if not format_url: continue if format_url in format_urls: continue diff --git a/youtube_dl/extractor/yapfiles.py b/youtube_dl/extractor/yapfiles.py index 7fafbf596..cfb368de9 100644 --- a/youtube_dl/extractor/yapfiles.py +++ b/youtube_dl/extractor/yapfiles.py @@ -4,12 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, qualities, unescapeHTML, + url_or_none, ) @@ -80,9 +80,9 @@ class YapFilesIE(InfoExtractor): formats = [] for format_id in QUALITIES: is_hd = format_id == 'hd' - format_url = playlist.get( - 'file%s' % ('_hd' if is_hd else '')) - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(playlist.get( + 'file%s' % ('_hd' if is_hd else ''))) + if not format_url: continue formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index f33fabe19..dff69fcb7 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -3,11 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, int_or_none, parse_duration, + url_or_none, ) @@ -50,8 +50,8 @@ class YouJizzIE(InfoExtractor): for encoding in encodings: if not isinstance(encoding, dict): continue - format_url = encoding.get('filename') - if not isinstance(format_url, compat_str): + format_url = url_or_none(encoding.get('filename')) + if not format_url: continue if determine_ext(format_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 547adefeb..ea0bce784 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, sanitized_Request, str_to_int, unescapeHTML, unified_strdate, + url_or_none, ) from ..aes import aes_decrypt_text @@ -88,8 +88,8 @@ class YouPornIE(InfoExtractor): for definition in definitions: if not isinstance(definition, dict): continue - video_url = definition.get('videoUrl') - if isinstance(video_url, compat_str) and video_url: + video_url = url_or_none(definition.get('videoUrl')) + if video_url: links.append(video_url) # Fallback #1, this also contains extra low quality 180p format diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index b5a3a0716..fb167c198 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -13,6 +13,7 @@ from ..utils import ( ExtractorError, int_or_none, try_get, + url_or_none, urlencode_postdata, ) @@ -150,8 +151,8 @@ class ZattooBaseIE(InfoExtractor): for watch in watch_urls: if not isinstance(watch, dict): continue - watch_url = watch.get('url') - if not watch_url or not isinstance(watch_url, compat_str): + watch_url = url_or_none(watch.get('url')) + if not watch_url: continue format_id_list = [stream_type] maxrate = watch.get('maxrate') diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index bb9020c91..afa3f6c47 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -15,6 +15,7 @@ from ..utils import ( try_get, unified_timestamp, update_url_query, + url_or_none, urljoin, ) @@ -67,8 +68,8 @@ class ZDFIE(ZDFBaseIE): def _extract_subtitles(src): subtitles = {} for caption in try_get(src, lambda x: x['captions'], list) or []: - subtitle_url = caption.get('uri') - if subtitle_url and isinstance(subtitle_url, compat_str): + subtitle_url = url_or_none(caption.get('uri')) + if subtitle_url: lang = caption.get('language', 'deu') subtitles.setdefault(lang, []).append({ 'url': subtitle_url, @@ -76,8 +77,8 @@ class ZDFIE(ZDFBaseIE): return subtitles def _extract_format(self, video_id, formats, format_urls, meta): - format_url = meta.get('url') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(meta.get('url')) + if not format_url: return if format_url in format_urls: return @@ -152,7 +153,8 @@ class ZDFIE(ZDFBaseIE): content, lambda x: x['teaserImageRef']['layouts'], dict) if layouts: for layout_key, layout_url in layouts.items(): - if not isinstance(layout_url, compat_str): + layout_url = url_or_none(layout_url) + if not layout_url: continue thumbnail = { 'url': layout_url, From 6f27998e750d8409f03cce2754ea3e9066b3b794 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Jul 2018 20:58:30 +0700 Subject: [PATCH 339/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ChangeLog b/ChangeLog index 1d602079e..bda7be4a9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,26 @@ +version <unreleased> + +Core ++ [utils] Introduce url_or_none +* [utils] Allow JSONP without function name (#17028) ++ [extractor/common] Extract DASH and MSS formats from SMIL manifests + +Extractors ++ [bbc] Add support for BBC Radio Play pages (#17022) +* [iwara] Fix download URLs (#17026) +* [vrtnu] Relax title extraction and extract JSON-LD (#17018) ++ [viu] Pass Referer and Origin headers and area id (#16992) ++ [vimeo] Add another config regular expression (#17013) ++ [facebook] Extract view count (#16942) +* [dailymotion] Improve description extraction (#16984) +* [slutload] Fix and improve extraction (#17001) +* [mediaset] Fix extraction (#16977) ++ [theplatform] Add support for theplatform TLD customization (#16977) +* [imgur] Relax URL regular expression (#16987) +* [pornhub] Improve extraction and extract all formats (#12166, #15891, #16262, + #16959) + + version 2018.07.10 Core From 8e66ffc3b7df8ad78e7f9e2e77d026c84027a814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Jul 2018 21:00:18 +0700 Subject: [PATCH 340/488] release 2018.07.21 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f192c6633..24827ba8f 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.10** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.21*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.21** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.07.10 +[debug] youtube-dl version 2018.07.21 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index bda7be4a9..94ecaae8e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.07.21 Core + [utils] Introduce url_or_none diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c7083cf47..9bf0ea30d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.07.10' +__version__ = '2018.07.21' From 6de82b44768624b888fd141e009540ff3bed9e6a Mon Sep 17 00:00:00 2001 From: Enes <enessolak99@gmail.com> Date: Tue, 24 Apr 2018 19:02:38 +0300 Subject: [PATCH 341/488] [puhutv] Add extractor (closes #16010) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/puhutv.py | 232 +++++++++++++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 youtube_dl/extractor/puhutv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c6f8a785a..29fab5b9a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -860,6 +860,10 @@ from .pornhub import ( from .pornotube import PornotubeIE from .pornovoisines import PornoVoisinesIE from .pornoxo import PornoXOIE +from .puhutv import ( + PuhuTVIE, + PuhuTVSerieIE, +) from .presstv import PressTVIE from .primesharetv import PrimeShareTVIE from .promptfile import PromptFileIE diff --git a/youtube_dl/extractor/puhutv.py b/youtube_dl/extractor/puhutv.py new file mode 100644 index 000000000..8abdab52a --- /dev/null +++ b/youtube_dl/extractor/puhutv.py @@ -0,0 +1,232 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + float_or_none, + determine_ext, + str_or_none, + url_or_none, + unified_strdate, + unified_timestamp, + try_get, + url_basename, + remove_end +) + + +class PuhuTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[a-z0-9-]+)-izle' + IE_NAME = 'puhutv' + _TESTS = [ + { + # A Film + 'url': 'https://puhutv.com/sut-kardesler-izle', + 'md5': 'a347470371d56e1585d1b2c8dab01c96', + 'info_dict': { + 'id': 'sut-kardesler', + 'display_id': '5085', + 'ext': 'mp4', + 'title': 'Süt Kardeşler', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Arzu Film', + 'description': 'md5:405fd024df916ca16731114eb18e511a', + 'uploader_id': '43', + 'upload_date': '20160729', + 'timestamp': int, + }, + }, + { + # An Episode and geo restricted + 'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle', + 'only_matching': True, + }, + { + # Has subtitle + 'url': 'https://puhutv.com/dip-1-bolum-izle', + 'only_matching': True, + } + ] + _SUBTITLE_LANGS = { + 'English': 'en', + 'Deutsch': 'de', + 'عربى': 'ar' + } + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_json( + 'https://puhutv.com/api/slug/%s-izle' % video_id, video_id)['data'] + + display_id = compat_str(info['id']) + title = info['title']['name'] + if info.get('display_name'): + title = '%s %s' % (title, info.get('display_name')) + + description = try_get(info, lambda x: x['title']['description'], compat_str) or info.get('description') + timestamp = unified_timestamp(info.get('created_at')) + upload_date = unified_strdate(info.get('created_at')) + uploader = try_get(info, lambda x: x['title']['producer']['name'], compat_str) + uploader_id = str_or_none(try_get(info, lambda x: x['title']['producer']['id'])) + view_count = int_or_none(try_get(info, lambda x: x['content']['watch_count'])) + duration = float_or_none(try_get(info, lambda x: x['content']['duration_in_ms']), scale=1000) + thumbnail = try_get(info, lambda x: x['content']['images']['wide']['main'], compat_str) + release_year = int_or_none(try_get(info, lambda x: x['title']['released_at'])) + webpage_url = info.get('web_url') + + season_number = int_or_none(info.get('season_number')) + season_id = int_or_none(info.get('season_id')) + episode_number = int_or_none(info.get('episode_number')) + + tags = [] + for tag in try_get(info, lambda x: x['title']['genres'], list) or []: + if isinstance(tag.get('name'), compat_str): + tags.append(tag.get('name')) + + thumbnails = [] + thumbs_dict = try_get(info, lambda x: x['content']['images']['wide'], dict) or {} + for id, url in thumbs_dict.items(): + if not url_or_none(url): + continue + thumbnails.append({ + 'url': 'https://%s' % url, + 'id': id + }) + + subtitles = {} + for subtitle in try_get(info, lambda x: x['content']['subtitles'], list) or []: + if not isinstance(subtitle, dict): + continue + lang = subtitle.get('language') + sub_url = url_or_none(subtitle.get('url')) + if not lang or not isinstance(lang, compat_str) or not sub_url: + continue + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ + 'url': sub_url + }] + + # Some of videos are geo restricted upon request copyright owner and returns 403 + req_formats = self._download_json( + 'https://puhutv.com/api/assets/%s/videos' % display_id, + video_id, 'Downloading video JSON') + + formats = [] + for format in req_formats['data']['videos']: + media_url = url_or_none(format.get('url')) + if not media_url: + continue + ext = format.get('video_format') or determine_ext(media_url) + quality = format.get('quality') + if format.get('stream_type') == 'hls' and format.get('is_playlist') is True: + m3u8_id = remove_end(url_basename(media_url), '.m3u8') + formats.append(self._m3u8_meta_format(media_url, ext, m3u8_id=m3u8_id)) + elif ext == 'mp4' and format.get('is_playlist', False) is False: + formats.append({ + 'url': media_url, + 'format_id': 'http-%s' % quality, + 'ext': ext, + 'height': quality + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'season_id': season_id, + 'season_number': season_number, + 'episode_number': episode_number, + 'release_year': release_year, + 'upload_date': upload_date, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'view_count': view_count, + 'duration': duration, + 'tags': tags, + 'subtitles': subtitles, + 'webpage_url': webpage_url, + 'thumbnail': thumbnail, + 'thumbnails': thumbnails, + 'formats': formats + } + + +class PuhuTVSerieIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[a-z0-9-]+)-detay' + IE_NAME = 'puhutv:serie' + _TESTS = [ + { + 'url': 'https://puhutv.com/deniz-yildizi-detay', + 'info_dict': { + 'title': 'Deniz Yıldızı', + 'id': 'deniz-yildizi', + 'uploader': 'Focus Film', + 'uploader_id': 61, + }, + 'playlist_mincount': 234, + }, + { + # a film detail page which is using same url with serie page + 'url': 'https://puhutv.com/kaybedenler-kulubu-detay', + 'info_dict': { + 'title': 'Kaybedenler Kulübü', + 'id': 'kaybedenler-kulubu', + 'uploader': 'Tolga Örnek, Murat Dörtbudak, Neslihan Dörtbudak, Kemal Kaplanoğlu', + 'uploader_id': 248, + }, + 'playlist_mincount': 1, + }, + ] + + def _extract_entries(self, playlist_id, seasons): + for season in seasons: + season_id = season['id'] + season_number = season.get('position') + pagenum = 1 + has_more = True + while has_more is True: + season_info = self._download_json( + 'https://galadriel.puhutv.com/seasons/%s' % season_id, + playlist_id, 'Downloading season %s page %s' % (season_number, pagenum), query={ + 'page': pagenum, + 'per': 40, + }) + for episode in season_info.get('episodes'): + video_id = episode['slugPath'].replace('-izle', '') + yield self.url_result( + 'https://puhutv.com/%s-izle' % video_id, + PuhuTVIE.ie_key(), video_id) + pagenum = pagenum + 1 + has_more = season_info.get('hasMore', False) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + info = self._download_json( + 'https://puhutv.com/api/slug/%s-detay' % playlist_id, playlist_id)['data'] + + title = info.get('name') + uploader = try_get(info, lambda x: x['producer']['name'], compat_str) + uploader_id = try_get(info, lambda x: x['producer']['id']) + seasons = info.get('seasons') + if seasons: + entries = self._extract_entries(playlist_id, seasons) + else: + # For films, these are using same url with series + video_id = info['assets'][0]['slug'] + return self.url_result( + 'https://puhutv.com/%s-izle' % video_id, + PuhuTVIE.ie_key(), video_id) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': title, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'entries': entries, + } From 8fd2a7be373a29b9bea491f952f14315a90c2f3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 20:25:46 +0700 Subject: [PATCH 342/488] [puhutv] Improve extraction (closes #16269) --- youtube_dl/extractor/puhutv.py | 313 +++++++++++++++++---------------- 1 file changed, 164 insertions(+), 149 deletions(-) diff --git a/youtube_dl/extractor/puhutv.py b/youtube_dl/extractor/puhutv.py index 8abdab52a..5465e8ab7 100644 --- a/youtube_dl/extractor/puhutv.py +++ b/youtube_dl/extractor/puhutv.py @@ -2,53 +2,54 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( + ExtractorError, int_or_none, float_or_none, - determine_ext, + parse_resolution, str_or_none, - url_or_none, - unified_strdate, - unified_timestamp, try_get, - url_basename, - remove_end + unified_timestamp, + url_or_none, + urljoin, ) class PuhuTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[a-z0-9-]+)-izle' + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle' IE_NAME = 'puhutv' - _TESTS = [ - { - # A Film - 'url': 'https://puhutv.com/sut-kardesler-izle', - 'md5': 'a347470371d56e1585d1b2c8dab01c96', - 'info_dict': { - 'id': 'sut-kardesler', - 'display_id': '5085', - 'ext': 'mp4', - 'title': 'Süt Kardeşler', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Arzu Film', - 'description': 'md5:405fd024df916ca16731114eb18e511a', - 'uploader_id': '43', - 'upload_date': '20160729', - 'timestamp': int, - }, + _TESTS = [{ + # film + 'url': 'https://puhutv.com/sut-kardesler-izle', + 'md5': 'fbd8f2d8e7681f8bcd51b592475a6ae7', + 'info_dict': { + 'id': '5085', + 'display_id': 'sut-kardesler', + 'ext': 'mp4', + 'title': 'Süt Kardeşler', + 'description': 'md5:405fd024df916ca16731114eb18e511a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4832.44, + 'creator': 'Arzu Film', + 'timestamp': 1469778212, + 'upload_date': '20160729', + 'release_year': 1976, + 'view_count': int, + 'tags': ['Aile', 'Komedi', 'Klasikler'], }, - { - # An Episode and geo restricted - 'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle', - 'only_matching': True, - }, - { - # Has subtitle - 'url': 'https://puhutv.com/dip-1-bolum-izle', - 'only_matching': True, - } - ] + }, { + # episode, geo restricted, bypassable with --geo-verification-proxy + 'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle', + 'only_matching': True, + }, { + # 4k, with subtitles + 'url': 'https://puhutv.com/dip-1-bolum-izle', + 'only_matching': True, + }] _SUBTITLE_LANGS = { 'English': 'en', 'Deutsch': 'de', @@ -56,47 +57,103 @@ class PuhuTVIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json( - 'https://puhutv.com/api/slug/%s-izle' % video_id, video_id)['data'] + display_id = self._match_id(url) - display_id = compat_str(info['id']) - title = info['title']['name'] + info = self._download_json( + urljoin(url, '/api/slug/%s-izle' % display_id), + display_id)['data'] + + video_id = compat_str(info['id']) + title = info.get('name') or info['title']['name'] if info.get('display_name'): title = '%s %s' % (title, info.get('display_name')) - description = try_get(info, lambda x: x['title']['description'], compat_str) or info.get('description') + try: + videos = self._download_json( + 'https://puhutv.com/api/assets/%s/videos' % video_id, + display_id, 'Downloading video JSON', + headers=self.geo_verification_headers()) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_geo_restricted() + raise + + formats = [] + for video in videos['data']['videos']: + media_url = url_or_none(video.get('url')) + if not media_url: + continue + playlist = video.get('is_playlist') + if video.get('stream_type') == 'hls' and playlist is True: + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = int_or_none(video.get('quality')) + f = { + 'url': media_url, + 'ext': 'mp4', + 'height': quality + } + video_format = video.get('video_format') + if video_format == 'hls' and playlist is False: + format_id = 'hls' + f['protocol'] = 'm3u8_native' + elif video_format == 'mp4': + format_id = 'http' + + else: + continue + if quality: + format_id += '-%sp' % quality + f['format_id'] = format_id + formats.append(f) + self._sort_formats(formats) + + description = try_get( + info, lambda x: x['title']['description'], + compat_str) or info.get('description') timestamp = unified_timestamp(info.get('created_at')) - upload_date = unified_strdate(info.get('created_at')) - uploader = try_get(info, lambda x: x['title']['producer']['name'], compat_str) - uploader_id = str_or_none(try_get(info, lambda x: x['title']['producer']['id'])) - view_count = int_or_none(try_get(info, lambda x: x['content']['watch_count'])) - duration = float_or_none(try_get(info, lambda x: x['content']['duration_in_ms']), scale=1000) - thumbnail = try_get(info, lambda x: x['content']['images']['wide']['main'], compat_str) - release_year = int_or_none(try_get(info, lambda x: x['title']['released_at'])) - webpage_url = info.get('web_url') + creator = try_get( + info, lambda x: x['title']['producer']['name'], compat_str) + + duration = float_or_none( + try_get(info, lambda x: x['content']['duration_in_ms'], int), + scale=1000) + view_count = try_get(info, lambda x: x['content']['watch_count'], int) + + images = try_get( + info, lambda x: x['content']['images']['wide'], dict) or {} + thumbnails = [] + for image_id, image_url in images.items(): + if not isinstance(image_url, compat_str): + continue + if not image_url.startswith(('http', '//')): + image_url = 'https://%s' % image_url + t = parse_resolution(image_id) + t.update({ + 'id': image_id, + 'url': image_url + }) + thumbnails.append(t) + + release_year = try_get(info, lambda x: x['title']['released_at'], int) season_number = int_or_none(info.get('season_number')) - season_id = int_or_none(info.get('season_id')) + season_id = str_or_none(info.get('season_id')) episode_number = int_or_none(info.get('episode_number')) tags = [] - for tag in try_get(info, lambda x: x['title']['genres'], list) or []: - if isinstance(tag.get('name'), compat_str): - tags.append(tag.get('name')) - - thumbnails = [] - thumbs_dict = try_get(info, lambda x: x['content']['images']['wide'], dict) or {} - for id, url in thumbs_dict.items(): - if not url_or_none(url): + for genre in try_get(info, lambda x: x['title']['genres'], list) or []: + if not isinstance(genre, dict): continue - thumbnails.append({ - 'url': 'https://%s' % url, - 'id': id - }) + genre_name = genre.get('name') + if genre_name and isinstance(genre_name, compat_str): + tags.append(genre_name) subtitles = {} - for subtitle in try_get(info, lambda x: x['content']['subtitles'], list) or []: + for subtitle in try_get( + info, lambda x: x['content']['subtitles'], list) or []: if not isinstance(subtitle, dict): continue lang = subtitle.get('language') @@ -107,30 +164,6 @@ class PuhuTVIE(InfoExtractor): 'url': sub_url }] - # Some of videos are geo restricted upon request copyright owner and returns 403 - req_formats = self._download_json( - 'https://puhutv.com/api/assets/%s/videos' % display_id, - video_id, 'Downloading video JSON') - - formats = [] - for format in req_formats['data']['videos']: - media_url = url_or_none(format.get('url')) - if not media_url: - continue - ext = format.get('video_format') or determine_ext(media_url) - quality = format.get('quality') - if format.get('stream_type') == 'hls' and format.get('is_playlist') is True: - m3u8_id = remove_end(url_basename(media_url), '.m3u8') - formats.append(self._m3u8_meta_format(media_url, ext, m3u8_id=m3u8_id)) - elif ext == 'mp4' and format.get('is_playlist', False) is False: - formats.append({ - 'url': media_url, - 'format_id': 'http-%s' % quality, - 'ext': ext, - 'height': quality - }) - self._sort_formats(formats) - return { 'id': video_id, 'display_id': display_id, @@ -140,93 +173,75 @@ class PuhuTVIE(InfoExtractor): 'season_number': season_number, 'episode_number': episode_number, 'release_year': release_year, - 'upload_date': upload_date, 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'creator': creator, 'view_count': view_count, 'duration': duration, 'tags': tags, 'subtitles': subtitles, - 'webpage_url': webpage_url, - 'thumbnail': thumbnail, 'thumbnails': thumbnails, 'formats': formats } class PuhuTVSerieIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[a-z0-9-]+)-detay' + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay' IE_NAME = 'puhutv:serie' - _TESTS = [ - { - 'url': 'https://puhutv.com/deniz-yildizi-detay', - 'info_dict': { - 'title': 'Deniz Yıldızı', - 'id': 'deniz-yildizi', - 'uploader': 'Focus Film', - 'uploader_id': 61, - }, - 'playlist_mincount': 234, + _TESTS = [{ + 'url': 'https://puhutv.com/deniz-yildizi-detay', + 'info_dict': { + 'title': 'Deniz Yıldızı', + 'id': 'deniz-yildizi', }, - { - # a film detail page which is using same url with serie page - 'url': 'https://puhutv.com/kaybedenler-kulubu-detay', - 'info_dict': { - 'title': 'Kaybedenler Kulübü', - 'id': 'kaybedenler-kulubu', - 'uploader': 'Tolga Örnek, Murat Dörtbudak, Neslihan Dörtbudak, Kemal Kaplanoğlu', - 'uploader_id': 248, - }, - 'playlist_mincount': 1, - }, - ] + 'playlist_mincount': 205, + }, { + # a film detail page which is using same url with serie page + 'url': 'https://puhutv.com/kaybedenler-kulubu-detay', + 'only_matching': True, + }] - def _extract_entries(self, playlist_id, seasons): + def _extract_entries(self, seasons): for season in seasons: - season_id = season['id'] - season_number = season.get('position') - pagenum = 1 + season_id = season.get('id') + if not season_id: + continue + page = 1 has_more = True while has_more is True: - season_info = self._download_json( + season = self._download_json( 'https://galadriel.puhutv.com/seasons/%s' % season_id, - playlist_id, 'Downloading season %s page %s' % (season_number, pagenum), query={ - 'page': pagenum, + season_id, 'Downloading page %s' % page, query={ + 'page': page, 'per': 40, }) - for episode in season_info.get('episodes'): - video_id = episode['slugPath'].replace('-izle', '') - yield self.url_result( - 'https://puhutv.com/%s-izle' % video_id, - PuhuTVIE.ie_key(), video_id) - pagenum = pagenum + 1 - has_more = season_info.get('hasMore', False) + episodes = season.get('episodes') + if isinstance(episodes, list): + for ep in episodes: + slug_path = str_or_none(ep.get('slugPath')) + if not slug_path: + continue + video_id = str_or_none(int_or_none(ep.get('id'))) + yield self.url_result( + 'https://puhutv.com/%s' % slug_path, + ie=PuhuTVIE.ie_key(), video_id=video_id, + video_title=ep.get('name') or ep.get('eventLabel')) + page += 1 + has_more = season.get('hasMore') def _real_extract(self, url): playlist_id = self._match_id(url) info = self._download_json( - 'https://puhutv.com/api/slug/%s-detay' % playlist_id, playlist_id)['data'] + urljoin(url, '/api/slug/%s-detay' % playlist_id), + playlist_id)['data'] - title = info.get('name') - uploader = try_get(info, lambda x: x['producer']['name'], compat_str) - uploader_id = try_get(info, lambda x: x['producer']['id']) seasons = info.get('seasons') if seasons: - entries = self._extract_entries(playlist_id, seasons) - else: - # For films, these are using same url with series - video_id = info['assets'][0]['slug'] - return self.url_result( - 'https://puhutv.com/%s-izle' % video_id, - PuhuTVIE.ie_key(), video_id) + return self.playlist_result( + self._extract_entries(seasons), playlist_id, info.get('name')) - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': title, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'entries': entries, - } + # For films, these are using same url with series + video_id = info.get('slug') or info['assets'][0]['slug'] + return self.url_result( + 'https://puhutv.com/%s-izle' % video_id, + PuhuTVIE.ie_key(), video_id) From a702056fbe0e4dc615ebc73d6891c505cd7d818f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:26:12 +0700 Subject: [PATCH 343/488] Credit @bastiandg for #16189 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index eaf96d79d..c9f7189ac 100644 --- a/AUTHORS +++ b/AUTHORS @@ -239,3 +239,4 @@ Martin Weinelt Surya Oktafendri TingPing Alexandre Macabies +Bastian de Groot From 7930f914949cbf28aa522463a22f0eea396875bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:27:28 +0700 Subject: [PATCH 344/488] Credit @haasn for #16326 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index c9f7189ac..9b9fff3c3 100644 --- a/AUTHORS +++ b/AUTHORS @@ -240,3 +240,4 @@ Surya Oktafendri TingPing Alexandre Macabies Bastian de Groot +Niklas Haas From d94fb1225edc546f81f4b01b2eebcc9e4cd89313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:29:25 +0700 Subject: [PATCH 345/488] Credit @dnet for #16174 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 9b9fff3c3..0b23f4066 100644 --- a/AUTHORS +++ b/AUTHORS @@ -241,3 +241,4 @@ TingPing Alexandre Macabies Bastian de Groot Niklas Haas +András Veres-Szentkirályi From 694079dff754e72e8b3b6c1395f1e9b8e9a66db7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:31:46 +0700 Subject: [PATCH 346/488] Credit @mrfade for #16269 and #16271 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 0b23f4066..af8accb51 100644 --- a/AUTHORS +++ b/AUTHORS @@ -242,3 +242,4 @@ Alexandre Macabies Bastian de Groot Niklas Haas András Veres-Szentkirályi +Enes Solak From a789d1cc90055aeefa4a2dafb4a81fcf43b65bb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:34:34 +0700 Subject: [PATCH 347/488] Credit @nathanrossi for #16554 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index af8accb51..c2f48668b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -243,3 +243,4 @@ Bastian de Groot Niklas Haas András Veres-Szentkirályi Enes Solak +Nathan Rossi From 234a85858cd50d6c88848c2e0ceffeabe26306e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:35:38 +0700 Subject: [PATCH 348/488] Credit @tmsbrg for #15462 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index c2f48668b..58b695784 100644 --- a/AUTHORS +++ b/AUTHORS @@ -244,3 +244,4 @@ Niklas Haas András Veres-Szentkirályi Enes Solak Nathan Rossi +Thomas van der Berg From d4e7065111b1c18bc47795b102a4e5c6757e9bad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Jul 2018 21:36:58 +0700 Subject: [PATCH 349/488] Credit @Kerruba for #16328 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 58b695784..b507cb8df 100644 --- a/AUTHORS +++ b/AUTHORS @@ -245,3 +245,4 @@ András Veres-Szentkirályi Enes Solak Nathan Rossi Thomas van der Berg +Luca Cherubin From 631f93ee2d5dfe5a90da38d293159670ada4d95e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 23 Jul 2018 06:20:00 +0100 Subject: [PATCH 350/488] [facebook] fix tahoe request for authenticated users(closes #16655) --- youtube_dl/extractor/facebook.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f78479b92..97cfe0fc3 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -355,7 +355,6 @@ class FacebookIE(InfoExtractor): tahoe_data = self._download_webpage( self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, data=urlencode_postdata({ - '__user': 0, '__a': 1, '__pc': self._search_regex( r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, @@ -363,6 +362,9 @@ class FacebookIE(InfoExtractor): '__rev': self._search_regex( r'client_revision["\']\s*:\s*(\d+),', webpage, 'client revision', default='3944515'), + 'fb_dtsg': self._search_regex( + r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"', + webpage, 'dtsg token', default=''), }), headers={ 'Content-Type': 'application/x-www-form-urlencoded', From b5dec62ca688f35feac1f56415355e9a8e850edb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Jul 2018 23:07:12 +0700 Subject: [PATCH 351/488] [streamcloud] Fix extraction (closes #17054) --- youtube_dl/extractor/streamcloud.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py index 6a6bb90c4..4a410611d 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/youtube_dl/extractor/streamcloud.py @@ -72,4 +72,7 @@ class StreamcloudIE(InfoExtractor): 'title': title, 'url': video_url, 'thumbnail': thumbnail, + 'http_headers': { + 'Referer': url, + }, } From ad1bc71a8a448583734b313bc77a2097200ad97b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 26 Jul 2018 07:24:46 +0100 Subject: [PATCH 352/488] [vk] fix extraction for inline only videos(fixes #16923) --- youtube_dl/extractor/vk.py | 40 +++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 48b5987c2..ef8b9bcb7 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -17,6 +17,7 @@ from ..utils import ( int_or_none, orderedSet, remove_start, + str_or_none, str_to_int, unescapeHTML, unified_timestamp, @@ -106,10 +107,10 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': 'ProtivoGunz - Хуёвая песня', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', + 'uploader_id': '-77521', 'duration': 195, - 'timestamp': 1329060660, + 'timestamp': 1329049880, 'upload_date': '20120212', - 'view_count': int, }, }, { @@ -118,12 +119,12 @@ class VKIE(VKBaseIE): 'info_dict': { 'id': '165548505', 'ext': 'mp4', - 'uploader': 'Tom Cruise', 'title': 'No name', + 'uploader': 'Tom Cruise', + 'uploader_id': '205387401', 'duration': 9, - 'timestamp': 1374374880, - 'upload_date': '20130721', - 'view_count': int, + 'timestamp': 1374364108, + 'upload_date': '20130720', } }, { @@ -207,10 +208,10 @@ class VKIE(VKBaseIE): 'id': 'V3K4mi0SYkc', 'ext': 'webm', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", - 'description': 'md5:d9903938abdc74c738af77f527ca0596', - 'duration': 178, + 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', + 'duration': 179, 'upload_date': '20130116', - 'uploader': "Children's Joy Foundation", + 'uploader': "Children's Joy Foundation Inc.", 'uploader_id': 'thecjf', 'view_count': int, }, @@ -222,6 +223,7 @@ class VKIE(VKBaseIE): 'id': 'k3lz2cmXyRuJQSjGHUv', 'ext': 'mp4', 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', + # TODO: fix test by fixing dailymotion description extraction 'description': 'md5:c651358f03c56f1150b555c26d90a0fd', 'uploader': 'AniLibria.Tv', 'upload_date': '20160914', @@ -241,9 +243,12 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': 'S-Dance, репетиции к The way show', 'uploader': 'THE WAY SHOW | 17 апреля', - 'timestamp': 1454870100, + 'uploader_id': '-110305615', + 'timestamp': 1454859345, 'upload_date': '20160207', - 'view_count': int, + }, + 'params': { + 'skip_download': True, }, }, { @@ -296,7 +301,7 @@ class VKIE(VKBaseIE): video_id = mobj.group('videoid') if video_id: - info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id + info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id # Some videos (removed?) can only be downloaded with list id specified list_id = mobj.group('list_id') if list_id: @@ -346,6 +351,9 @@ class VKIE(VKBaseIE): r'<!>This video is no longer available, because its author has been blocked.': 'Video %s is no longer available, because its author has been blocked.', + + r'<!>This video is no longer available, because it has been deleted.': + 'Video %s is no longer available, because it has been deleted.', } for error_re, error_msg in ERRORS.items(): @@ -394,7 +402,8 @@ class VKIE(VKBaseIE): if not data: data = self._parse_json( self._search_regex( - r'<!json>\s*({.+?})\s*<!>', info_page, 'json', default='{}'), + [r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'], + info_page, 'json', default='{}'), video_id) if data: data = data['player']['params'][0] @@ -416,7 +425,7 @@ class VKIE(VKBaseIE): timestamp = unified_timestamp(self._html_search_regex( r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page, - 'upload date', fatal=False)) + 'upload date', default=None)) or int_or_none(data.get('date')) view_count = str_to_int(self._search_regex( r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', @@ -454,9 +463,12 @@ class VKIE(VKBaseIE): 'title': title, 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), + 'uploader_id': str_or_none(data.get('author_id')), 'duration': data.get('duration'), 'timestamp': timestamp, 'view_count': view_count, + 'like_count': int_or_none(data.get('liked')), + 'dislike_count': int_or_none(data.get('nolikes')), 'is_live': is_live, } From 0c7b4f49eb07bb68918da3dd7ff277565273033f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 26 Jul 2018 08:11:06 +0100 Subject: [PATCH 353/488] [rai] return non http relinker URL intact(closes #17055) --- youtube_dl/extractor/rai.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index d22311031..f916b2619 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -32,6 +32,9 @@ class RaiBaseIE(InfoExtractor): _GEO_BYPASS = False def _extract_relinker_info(self, relinker_url, video_id): + if not re.match(r'https?://', relinker_url): + return {'formats': [{'url': relinker_url}]} + formats = [] geoprotection = None is_live = None @@ -369,6 +372,10 @@ class RaiIE(RaiBaseIE): 'params': { 'skip_download': True, }, + }, { + # Direct MMS URL + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', + 'only_matching': True, }] def _extract_from_content_id(self, content_id, url): From 722f1a0f8f82617b3abf646b2d0df2c624e98912 Mon Sep 17 00:00:00 2001 From: Sidney de Koning <sidney.dekoning@gmail.com> Date: Fri, 27 Jul 2018 19:18:41 +0200 Subject: [PATCH 354/488] [README.md] Actualize Firefox cookie export add-on Previous one does not work with newer Firefox versions --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6d49d6a4f..dd068a462 100644 --- a/README.md +++ b/README.md @@ -870,7 +870,7 @@ Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the op Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. -In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox). +In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox). Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. From 8e37a7e4cce7555c12b8d9f7a1d331476aba357c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 28 Jul 2018 06:52:36 +0100 Subject: [PATCH 355/488] [mitele] reduce number of requests and update tests --- youtube_dl/extractor/mitele.py | 109 +++------------------------------ 1 file changed, 10 insertions(+), 99 deletions(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 42759eae8..40f214a87 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,84 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import uuid - from .common import InfoExtractor -from .ooyala import OoyalaIE -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( int_or_none, - extract_attributes, - determine_ext, smuggle_url, parse_duration, ) -class MiTeleBaseIE(InfoExtractor): - def _get_player_info(self, url, webpage): - player_data = extract_attributes(self._search_regex( - r'(?s)(<ms-video-player.+?</ms-video-player>)', - webpage, 'ms video player')) - video_id = player_data['data-media-id'] - if player_data.get('data-cms-id') == 'ooyala': - return self.url_result( - 'ooyala:%s' % video_id, ie=OoyalaIE.ie_key(), video_id=video_id) - config_url = compat_urlparse.urljoin(url, player_data['data-config']) - config = self._download_json( - config_url, video_id, 'Downloading config JSON') - mmc_url = config['services']['mmc'] - - duration = None - formats = [] - for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')): - mmc = self._download_json( - m_url, video_id, 'Downloading mmc JSON') - if not duration: - duration = int_or_none(mmc.get('duration')) - for location in mmc['locations']: - gat = self._proto_relative_url(location.get('gat'), 'http:') - gcp = location.get('gcp') - ogn = location.get('ogn') - if None in (gat, gcp, ogn): - continue - token_data = { - 'gcp': gcp, - 'ogn': ogn, - 'sta': 0, - } - media = self._download_json( - gat, video_id, data=json.dumps(token_data).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }) - stream = media.get('stream') or media.get('file') - if not stream: - continue - ext = determine_ext(stream) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'), - 'duration': duration, - } - - class MiTeleIE(InfoExtractor): IE_DESC = 'mitele.es' _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player' @@ -86,7 +16,7 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player', 'info_dict': { - 'id': '57b0dfb9c715da65618b4afa', + 'id': 'FhYW1iNTE6J6H7NkQRIEzfne6t2quqPg', 'ext': 'mp4', 'title': 'Tor, la web invisible', 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', @@ -104,7 +34,7 @@ class MiTeleIE(InfoExtractor): # no explicit title 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player', 'info_dict': { - 'id': '57b0de3dc915da14058b4876', + 'id': 'oyNG1iNTE6TAPP-JmCjbwfwJqqMMX3Vq', 'ext': 'mp4', 'title': 'Cuarto Milenio Temporada 6 Programa 226', 'description': 'md5:5ff132013f0cd968ffbf1f5f3538a65f', @@ -128,40 +58,21 @@ class MiTeleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - gigya_url = self._search_regex( - r'<gigya-api>[^>]*</gigya-api>[^>]*<script\s+src="([^"]*)">[^>]*</script>', - webpage, 'gigya', default=None) - gigya_sc = self._download_webpage( - compat_urlparse.urljoin('http://www.mitele.es/', gigya_url), - video_id, 'Downloading gigya script') - - # Get a appKey/uuid for getting the session key - appKey = self._search_regex( - r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)', - gigya_sc, 'appKey') - - session_json = self._download_json( - 'https://appgrid-api.cloud.accedo.tv/session', - video_id, 'Downloading session keys', query={ - 'appKey': appKey, - 'uuid': compat_str(uuid.uuid4()), - }) paths = self._download_json( - 'https://appgrid-api.cloud.accedo.tv/metadata/general_configuration,%20web_configuration', - video_id, 'Downloading paths JSON', - query={'sessionKey': compat_str(session_json['sessionKey'])}) + 'https://www.mitele.es/amd/agp/web/metadata/general_configuration', + video_id, 'Downloading paths JSON') ooyala_s = paths['general_configuration']['api_configuration']['ooyala_search'] + base_url = ooyala_s.get('base_url', 'cdn-search-mediaset.carbyne.ps.ooyala.com') + full_path = ooyala_s.get('full_path', '/search/v1/full/providers/') source = self._download_json( - 'http://%s%s%s/docs/%s' % ( - ooyala_s['base_url'], ooyala_s['full_path'], - ooyala_s['provider_id'], video_id), + '%s://%s%s%s/docs/%s' % ( + ooyala_s.get('protocol', 'https'), base_url, full_path, + ooyala_s.get('provider_id', '104951'), video_id), video_id, 'Downloading data JSON', query={ 'include_titles': 'Series,Season', - 'product_name': 'test', + 'product_name': ooyala_s.get('product_name', 'test'), 'format': 'full', })['hits']['hits'][0]['_source'] From a098c99f0d0deb4ad0d5c9b67496582d89970368 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 28 Jul 2018 06:55:18 +0100 Subject: [PATCH 356/488] [telecinco] fix extraction(closes #17080) --- youtube_dl/extractor/telecinco.py | 122 ++++++++++++++++++++++++++---- 1 file changed, 106 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index fdcc7d573..d37e1b055 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -1,26 +1,43 @@ # coding: utf-8 from __future__ import unicode_literals -from .mitele import MiTeleBaseIE +import json +import re + +from .common import InfoExtractor +from .ooyala import OoyalaIE +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + str_or_none, + urljoin, +) -class TelecincoIE(MiTeleBaseIE): +class TelecincoIE(InfoExtractor): IE_DESC = 'telecinco.es, cuatro.com and mediaset.es' _VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', - 'md5': '8d7b2d5f699ee2709d992a63d5cd1712', 'info_dict': { - 'id': 'JEA5ijCnF6p5W08A1rNKn7', - 'ext': 'mp4', + 'id': '1876350223', 'title': 'Bacalao con kokotxas al pil-pil', 'description': 'md5:1382dacd32dd4592d478cbdca458e5bb', - 'duration': 662, }, + 'playlist': [{ + 'md5': 'adb28c37238b675dad0f042292f209a7', + 'info_dict': { + 'id': 'JEA5ijCnF6p5W08A1rNKn7', + 'ext': 'mp4', + 'title': 'Con Martín Berasategui, hacer un bacalao al pil-pil es fácil y divertido', + 'duration': 662, + }, + }] }, { 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', - 'md5': '284393e5387b3b947b77c613ef04749a', + 'md5': '9468140ebc300fbb8b9d65dc6e5c4b43', 'info_dict': { 'id': 'jn24Od1zGLG4XUZcnUnZB6', 'ext': 'mp4', @@ -30,7 +47,7 @@ class TelecincoIE(MiTeleBaseIE): }, }, { 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', - 'md5': '749afab6ea5a136a8806855166ae46a2', + 'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6', 'info_dict': { 'id': 'aywerkD2Sv1vGNqq9b85Q2', 'ext': 'mp4', @@ -50,17 +67,90 @@ class TelecincoIE(MiTeleBaseIE): 'only_matching': True, }] + def _parse_content(self, content, url): + video_id = content['dataMediaId'] + if content.get('dataCmsId') == 'ooyala': + return self.url_result( + 'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id) + config_url = urljoin(url, content['dataConfig']) + config = self._download_json( + config_url, video_id, 'Downloading config JSON') + title = config['info']['title'] + + def mmc_url(mmc_type): + return re.sub( + r'/(?:flash|html5)\.json', '/%s.json' % mmc_type, + config['services']['mmc']) + + duration = None + formats = [] + for mmc_type in ('flash', 'html5'): + mmc = self._download_json( + mmc_url(mmc_type), video_id, + 'Downloading %s mmc JSON' % mmc_type, fatal=False) + if not mmc: + continue + if not duration: + duration = int_or_none(mmc.get('duration')) + for location in mmc['locations']: + gat = self._proto_relative_url(location.get('gat'), 'http:') + gcp = location.get('gcp') + ogn = location.get('ogn') + if None in (gat, gcp, ogn): + continue + token_data = { + 'gcp': gcp, + 'ogn': ogn, + 'sta': 0, + } + media = self._download_json( + gat, video_id, data=json.dumps(token_data).encode('utf-8'), + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }, fatal=False) or {} + stream = media.get('stream') or media.get('file') + if not stream: + continue + ext = determine_ext(stream) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'), + 'duration': duration, + } + def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage, 'title') - info = self._get_player_info(url, webpage) + article = self._parse_json(self._search_regex( + r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})', + webpage, 'article'), display_id)['article'] + title = article.get('title') + description = clean_html(article.get('leadParagraph')) + if article.get('editorialType') != 'VID': + entries = [] + for p in article.get('body', []): + content = p.get('content') + if p.get('type') != 'video' or not content: + continue + entries.append(self._parse_content(content, url)) + return self.playlist_result( + entries, str_or_none(article.get('id')), title, description) + content = article['opening']['content'] + info = self._parse_content(content, url) info.update({ - 'display_id': display_id, - 'title': title, - 'description': self._html_search_meta( - ['og:description', 'twitter:description'], - webpage, 'title', fatal=False), + 'description': description, }) return info From 9a984265b90eb0e8a7c26a1edf479fbfcebce0a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Jul 2018 21:26:23 +0700 Subject: [PATCH 357/488] [ted] Fix extraction for videos without nativeDownloads (closes #16756, closes #17085) --- youtube_dl/extractor/ted.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 06a27fd04..dc9c5ce8e 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -107,6 +107,19 @@ class TEDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # no nativeDownloads + 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth', + 'info_dict': { + 'id': '1792', + 'ext': 'mp4', + 'title': 'The orchestra in my mouth', + 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a', + 'uploader': 'Tom Thum', + }, + 'params': { + 'skip_download': True, + }, }] _NATIVE_FORMATS = { @@ -180,8 +193,10 @@ class TEDIE(InfoExtractor): } native_downloads = try_get( - talk_info, lambda x: x['downloads']['nativeDownloads'], - dict) or talk_info['nativeDownloads'] + talk_info, + (lambda x: x['downloads']['nativeDownloads'], + lambda x: x['nativeDownloads']), + dict) or {} formats = [{ 'url': format_url, From cd3a3ff93bd5d6866d3822cb438b0e172ffe4e39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Jul 2018 22:09:53 +0700 Subject: [PATCH 358/488] [ted] Improve extraction and update tests --- youtube_dl/extractor/ted.py | 111 ++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index dc9c5ce8e..212ac80ab 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -7,8 +7,10 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + float_or_none, int_or_none, try_get, + url_or_none, ) @@ -30,7 +32,7 @@ class TEDIE(InfoExtractor): ''' _TESTS = [{ 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', - 'md5': '0de43ac406aa3e4ea74b66c9c7789b13', + 'md5': 'b0ce2b05ca215042124fbc9e3886493a', 'info_dict': { 'id': '102', 'ext': 'mp4', @@ -42,24 +44,30 @@ class TEDIE(InfoExtractor): 'uploader': 'Dan Dennett', 'width': 853, 'duration': 1308, - } - }, { - 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms', - 'md5': 'b899ac15e345fb39534d913f7606082b', - 'info_dict': { - 'id': 'tSVI8ta_P4w', - 'ext': 'mp4', - 'title': 'Vishal Sikka: The beauty and power of algorithms', - 'thumbnail': r're:^https?://.+\.jpg', - 'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4', - 'upload_date': '20140122', - 'uploader_id': 'TEDInstitute', - 'uploader': 'TED Institute', + 'view_count': int, + 'comment_count': int, + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + # missing HTTP bitrates + 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms', + 'info_dict': { + 'id': '6069', + 'ext': 'mp4', + 'title': 'The beauty and power of algorithms', + 'thumbnail': r're:^https?://.+\.jpg', + 'description': 'md5:734e352710fb00d840ab87ae31aaf688', + 'uploader': 'Vishal Sikka', + }, + 'params': { + 'skip_download': True, }, - 'add_ie': ['Youtube'], }, { 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best', - 'md5': '71b3ab2f4233012dce09d515c9c39ce2', + 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0', 'info_dict': { 'id': '1972', 'ext': 'mp4', @@ -68,6 +76,9 @@ class TEDIE(InfoExtractor): 'description': 'md5:5174aed4d0f16021b704120360f72b92', 'duration': 1128, }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.ted.com/playlists/who_are_the_hackers', 'info_dict': { @@ -91,22 +102,6 @@ class TEDIE(InfoExtractor): 'params': { 'skip_download': True, }, - }, { - # YouTube video - 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond', - 'add_ie': ['Youtube'], - 'info_dict': { - 'id': 'aFBIPO-P7LM', - 'ext': 'mp4', - 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville', - 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1', - 'uploader': 'TEDx Talks', - 'uploader_id': 'TEDxTalks', - 'upload_date': '20111216', - }, - 'params': { - 'skip_download': True, - }, }, { # no nativeDownloads 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth', @@ -116,6 +111,9 @@ class TEDIE(InfoExtractor): 'title': 'The orchestra in my mouth', 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a', 'uploader': 'Tom Thum', + 'view_count': int, + 'comment_count': int, + 'tags': list, }, 'params': { 'skip_download': True, @@ -174,24 +172,11 @@ class TEDIE(InfoExtractor): info = self._extract_info(webpage) - talk_info = try_get( - info, lambda x: x['__INITIAL_DATA__']['talks'][0], - dict) or info['talks'][0] + data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info + talk_info = data['talks'][0] title = talk_info['title'].strip() - external = talk_info.get('external') - if external: - service = external['service'] - self.to_screen('Found video from %s' % service) - ext_url = None - if service.lower() == 'youtube': - ext_url = external.get('code') - return { - '_type': 'url', - 'url': ext_url or external['uri'], - } - native_downloads = try_get( talk_info, (lambda x: x['downloads']['nativeDownloads'], @@ -211,10 +196,24 @@ class TEDIE(InfoExtractor): player_talk = talk_info['player_talks'][0] + external = player_talk.get('external') + if isinstance(external, dict): + service = external.get('service') + if isinstance(service, compat_str): + ext_url = None + if service.lower() == 'youtube': + ext_url = external.get('code') + return { + '_type': 'url', + 'url': ext_url or external['uri'], + } + resources_ = player_talk.get('resources') or talk_info.get('resources') http_url = None for format_id, resources in resources_.items(): + if not isinstance(resources, dict): + continue if format_id == 'h264': for resource in resources: h264_url = resource.get('file') @@ -243,8 +242,12 @@ class TEDIE(InfoExtractor): 'tbr': int_or_none(resource.get('bitrate')), }) elif format_id == 'hls': + stream_url = url_or_none(resources.get('stream')) + if not stream_url: + continue formats.extend(self._extract_m3u8_formats( - resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False)) + stream_url, video_name, 'mp4', m3u8_id=format_id, + fatal=False)) m3u8_formats = list(filter( lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', @@ -254,9 +257,13 @@ class TEDIE(InfoExtractor): bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) if not bitrate: continue + bitrate_url = re.sub(r'\d+k', bitrate, http_url) + if not self._is_valid_url( + bitrate_url, video_name, '%s bitrate' % bitrate): + continue f = m3u8_format.copy() f.update({ - 'url': re.sub(r'\d+k', bitrate, http_url), + 'url': bitrate_url, 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) @@ -282,7 +289,11 @@ class TEDIE(InfoExtractor): 'description': self._og_search_description(webpage), 'subtitles': self._get_subtitles(video_id, talk_info), 'formats': formats, - 'duration': talk_info.get('duration'), + 'duration': float_or_none(talk_info.get('duration')), + 'view_count': int_or_none(data.get('viewed_count')), + 'comment_count': int_or_none( + try_get(data, lambda x: x['comments']['count'])), + 'tags': try_get(talk_info, lambda x: x['tags'], list), } def _get_subtitles(self, video_id, talk_info): From ec240a43696478e43abb15e7c91f067b2bd5fe08 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 28 Jul 2018 20:29:56 +0100 Subject: [PATCH 359/488] [dailymotion:playlist] fix extraction(closes #16894) --- youtube_dl/extractor/dailymotion.py | 126 ++++++++++++++++++---------- 1 file changed, 84 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 8f5f57b98..040f0bd02 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import base64 +import functools import hashlib import itertools import json @@ -16,11 +17,13 @@ from ..utils import ( error_to_compat_str, ExtractorError, int_or_none, + mimetype2ext, + OnDemandPagedList, parse_iso8601, sanitized_Request, str_to_int, unescapeHTML, - mimetype2ext, + urlencode_postdata, ) @@ -343,17 +346,93 @@ class DailymotionIE(DailymotionBaseInfoExtractor): class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)' - _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' - _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)' _TESTS = [{ 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', 'info_dict': { 'title': 'SPORT', - 'id': 'xv4bw_nqtv_sport', + 'id': 'xv4bw', }, 'playlist_mincount': 20, }] + _PAGE_SIZE = 100 + + def _fetch_page(self, playlist_id, authorizaion, page): + page += 1 + videos = self._download_json( + 'https://graphql.api.dailymotion.com', + playlist_id, 'Downloading page %d' % page, + data=json.dumps({ + 'query': '''{ + collection(xid: "%s") { + videos(first: %d, page: %d) { + pageInfo { + hasNextPage + nextPage + } + edges { + node { + xid + url + } + } + } + } +}''' % (playlist_id, self._PAGE_SIZE, page) + }).encode(), headers={ + 'Authorization': authorizaion, + 'Origin': 'https://www.dailymotion.com', + })['data']['collection']['videos'] + for edge in videos['edges']: + node = edge['node'] + yield self.url_result( + node['url'], DailymotionIE.ie_key(), node['xid']) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + api = self._parse_json(self._search_regex( + r'__PLAYER_CONFIG__\s*=\s*({.+?});', + webpage, 'player config'), playlist_id)['context']['api'] + auth = self._download_json( + api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'), + playlist_id, data=urlencode_postdata({ + 'client_id': api.get('client_id', 'f1a362d288c1b98099c7'), + 'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'), + 'grant_type': 'client_credentials', + })) + authorizaion = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token']) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, playlist_id, authorizaion), self._PAGE_SIZE) + return self.playlist_result( + entries, playlist_id, + self._og_search_title(webpage)) + + +class DailymotionUserIE(DailymotionBaseInfoExtractor): + IE_NAME = 'dailymotion:user' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' + _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' + _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' + _TESTS = [{ + 'url': 'https://www.dailymotion.com/user/nqtv', + 'info_dict': { + 'id': 'nqtv', + 'title': 'Rémi Gaillard', + }, + 'playlist_mincount': 100, + }, { + 'url': 'http://www.dailymotion.com/user/UnderProject', + 'info_dict': { + 'id': 'UnderProject', + 'title': 'UnderProject', + }, + 'playlist_mincount': 1800, + 'expected_warnings': [ + 'Stopped at duplicated page', + ], + 'skip': 'Takes too long time', + }] def _extract_entries(self, id): video_ids = set() @@ -379,43 +458,6 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: break - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - webpage = self._download_webpage(url, playlist_id) - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': self._og_search_title(webpage), - 'entries': self._extract_entries(playlist_id), - } - - -class DailymotionUserIE(DailymotionPlaylistIE): - IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' - _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' - _TESTS = [{ - 'url': 'https://www.dailymotion.com/user/nqtv', - 'info_dict': { - 'id': 'nqtv', - 'title': 'Rémi Gaillard', - }, - 'playlist_mincount': 100, - }, { - 'url': 'http://www.dailymotion.com/user/UnderProject', - 'info_dict': { - 'id': 'UnderProject', - 'title': 'UnderProject', - }, - 'playlist_mincount': 1800, - 'expected_warnings': [ - 'Stopped at duplicated page', - ], - 'skip': 'Takes too long time', - }] - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user = mobj.group('user') From 38e87f6c2ae2cdc04dd6f526213c83a0259db335 Mon Sep 17 00:00:00 2001 From: Huyuumi <zx.you.funy@gmail.com> Date: Sun, 29 Jul 2018 07:52:42 +0900 Subject: [PATCH 360/488] [utils] Remove return from __init__ --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b84436ed6..29cafd8f0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3569,7 +3569,7 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): setattr(self, '%s_open' % type, lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: meth(r, proxy, type)) - return compat_urllib_request.ProxyHandler.__init__(self, proxies) + compat_urllib_request.ProxyHandler.__init__(self, proxies) def proxy_open(self, req, proxy, type): req_proxy = req.headers.get('Ytdl-request-proxy') From 1a88fc5a69249a6c36ce5dbb6e5d251792ab6f39 Mon Sep 17 00:00:00 2001 From: bato3 <bato3@bandyci.org> Date: Sun, 29 Jul 2018 01:04:59 +0200 Subject: [PATCH 361/488] [ceskatelevize] Use https for API call (refs #16997) --- youtube_dl/extractor/ceskatelevize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 6bad90859..46380430f 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -108,7 +108,7 @@ class CeskaTelevizeIE(InfoExtractor): for user_agent in (None, USER_AGENTS['Safari']): req = sanitized_Request( - 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') From 4938c8d5730fd0db9eee977f94052c0058e407f0 Mon Sep 17 00:00:00 2001 From: Giuseppe Fabiano <gfabiano40@gmail.com> Date: Sun, 29 Jul 2018 01:24:10 +0200 Subject: [PATCH 362/488] [pornhub] Add support for subtitles (closes #16924) --- youtube_dl/extractor/pornhub.py | 34 +++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 97f988da4..ffc4405a8 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -18,6 +18,7 @@ from ..utils import ( orderedSet, remove_quotes, str_to_int, + url_or_none, ) @@ -68,6 +69,31 @@ class PornHubIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # subtitles + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', + 'info_dict': { + 'id': 'ph5af5fef7c2aa7', + 'ext': 'mp4', + 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor', + 'uploader': 'BFFs', + 'duration': 622, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + 'subtitles': { + 'en': [{ + "ext": 'srt' + }] + }, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, @@ -139,12 +165,19 @@ class PornHubIE(InfoExtractor): video_urls = [] video_urls_set = set() + subtitles = {} flashvars = self._parse_json( self._search_regex( r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), video_id) if flashvars: + subtitle_url = url_or_none(flashvars.get('closedCaptionsFile')) + if subtitle_url: + subtitles.setdefault('en', []).append({ + 'url': subtitle_url, + 'ext': 'srt', + }) thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) media_definitions = flashvars.get('mediaDefinitions') @@ -256,6 +289,7 @@ class PornHubIE(InfoExtractor): 'age_limit': 18, 'tags': tags, 'categories': categories, + 'subtitles': subtitles, } From b2286f8fb29a7be8eefe53e074df8ab1092a12d8 Mon Sep 17 00:00:00 2001 From: bato3 <bato3@bandyci.org> Date: Sun, 29 Jul 2018 01:56:52 +0200 Subject: [PATCH 363/488] [crunchyroll:playlist] Restrict _VALID_URL (closes #17069) --- youtube_dl/extractor/crunchyroll.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 311da515d..463f995c7 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -262,6 +262,9 @@ class CrunchyrollIE(CrunchyrollBaseIE): # Just test metadata extraction 'skip_download': True, }, + }, { + 'url': 'http://www.crunchyroll.com/media-723735', + 'only_matching': True, }] _FORMAT_IDS = { @@ -580,7 +583,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = 'crunchyroll:playlist' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', From 4eecef84f32869c25d56a4297a09b1b0b14a403e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Jul 2018 06:58:37 +0700 Subject: [PATCH 364/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index 94ecaae8e..f9dd0a89f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version <unreleased> + +Extractors +* [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076) ++ [pornhub] Add support for subtitles (#16924, #17088) +* [ceskatelevize] Use https for API call (#16997, #16999) +* [dailymotion:playlist] Fix extraction (#16894) +* [ted] Improve extraction +* [ted] Fix extraction for videos without nativeDownloads (#16756, #17085) +* [telecinco] Fix extraction (#17080) +* [mitele] Reduce number of requests +* [rai] Return non HTTP relinker URL intact (#17055) +* [vk] Fix extraction for inline only videos (#16923) +* [streamcloud] Fix extraction (#17054) +* [facebook] Fix tahoe player extraction with authentication (#16655) ++ [puhutv] Add support for puhutv.com (#12712, #16010, #16269) + + version 2018.07.21 Core From 548482841867a16d3f68e18f78091e59f768a880 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Jul 2018 07:02:18 +0700 Subject: [PATCH 365/488] release 2018.07.29 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 24827ba8f..cae5fd749 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.21*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.21** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.29*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.29** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.07.21 +[debug] youtube-dl version 2018.07.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index f9dd0a89f..dfa27d3be 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.07.29 Extractors * [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 6cbe81802..f464d89db 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -672,6 +672,8 @@ - **PrimeShareTV** - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital + - **puhutv** + - **puhutv:serie** - **Puls4** - **Pyvideo** - **qqmusic**: QQ音乐 diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9bf0ea30d..2048b69d2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.07.21' +__version__ = '2018.07.29' From 9d1b213845f35af4de40dd057754f8f285091bfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 30 Jul 2018 03:05:36 +0700 Subject: [PATCH 366/488] [viqeo] Add extractor (closes #17066) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 15 +++++ youtube_dl/extractor/viqeo.py | 99 ++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+) create mode 100644 youtube_dl/extractor/viqeo.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 29fab5b9a..c7a91a986 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1291,6 +1291,7 @@ from .viki import ( VikiIE, VikiChannelIE, ) +from .viqeo import ViqeoIE from .viu import ( ViuIE, ViuPlaylistIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e5a8ffbe8..43218c3a4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -113,6 +113,7 @@ from .peertube import PeerTubeIE from .indavideo import IndavideoEmbedIE from .apa import APAIE from .foxnews import FoxNewsIE +from .viqeo import ViqeoIE class GenericIE(InfoExtractor): @@ -2060,6 +2061,15 @@ class GenericIE(InfoExtractor): }, 'skip': 'TODO: fix nested playlists processing in tests', }, + { + # Viqeo embeds + 'url': 'https://viqeo.tv/', + 'info_dict': { + 'id': 'viqeo', + 'title': 'All-new video platform', + }, + 'playlist_count': 6, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -3094,6 +3104,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( sharevideos_urls, video_id, video_title) + viqeo_urls = ViqeoIE._extract_urls(webpage) + if viqeo_urls: + return self.playlist_from_matches( + viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/youtube_dl/extractor/viqeo.py b/youtube_dl/extractor/viqeo.py new file mode 100644 index 000000000..be7dfa814 --- /dev/null +++ b/youtube_dl/extractor/viqeo.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + str_or_none, + url_or_none, +) + + +class ViqeoIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + viqeo:| + https?://cdn\.viqeo\.tv/embed/*\?.*?\bvid=| + https?://api\.viqeo\.tv/v\d+/data/startup?.*?\bvideo(?:%5B%5D|\[\])= + ) + (?P<id>[\da-f]+) + ''' + _TESTS = [{ + 'url': 'https://cdn.viqeo.tv/embed/?vid=cde96f09d25f39bee837', + 'md5': 'a169dd1a6426b350dca4296226f21e76', + 'info_dict': { + 'id': 'cde96f09d25f39bee837', + 'ext': 'mp4', + 'title': 'cde96f09d25f39bee837', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 76, + }, + }, { + 'url': 'viqeo:cde96f09d25f39bee837', + 'only_matching': True, + }, { + 'url': 'https://api.viqeo.tv/v1/data/startup?video%5B%5D=71bbec412ade45c3216c&profile=112', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://cdn.viqeo.tv/embed/?vid=%s' % video_id, video_id) + + data = self._parse_json( + self._search_regex( + r'SLOT_DATA\s*=\s*({.+?})\s*;', webpage, 'slot data'), + video_id) + + formats = [] + thumbnails = [] + for media_file in data['mediaFiles']: + if not isinstance(media_file, dict): + continue + media_url = url_or_none(media_file.get('url')) + if not media_url or not media_url.startswith(('http', '//')): + continue + media_type = str_or_none(media_file.get('type')) + if not media_type: + continue + media_kind = media_type.split('/')[0].lower() + f = { + 'url': media_url, + 'width': int_or_none(media_file.get('width')), + 'height': int_or_none(media_file.get('height')), + } + format_id = str_or_none(media_file.get('quality')) + if media_kind == 'image': + f['id'] = format_id + thumbnails.append(f) + elif media_kind in ('video', 'audio'): + is_audio = media_kind == 'audio' + f.update({ + 'format_id': 'audio' if is_audio else format_id, + 'fps': int_or_none(media_file.get('fps')), + 'vcodec': 'none' if is_audio else None, + }) + formats.append(f) + self._sort_formats(formats) + + duration = int_or_none(data.get('duration')) + + return { + 'id': video_id, + 'title': video_id, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } From 7ff129d3ea6ec010493a3b98d960e943eda05595 Mon Sep 17 00:00:00 2001 From: Giuseppe Fabiano <gfabiano40@gmail.com> Date: Sun, 29 Jul 2018 22:15:06 +0200 Subject: [PATCH 367/488] [theplatform] Relax _VALID_URL (closes #16181) --- youtube_dl/extractor/theplatform.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 411b1f874..ffef5bf06 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -310,7 +310,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): class ThePlatformFeedIE(ThePlatformBaseIE): _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s' - _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[\w-]+))' + _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[^&]+))' _TESTS = [{ # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207', @@ -327,6 +327,9 @@ class ThePlatformFeedIE(ThePlatformBaseIE): 'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'], 'uploader': 'NBCU-NEWS', }, + }, { + 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byGuid=nn_netcast_180306.Copy.01', + 'only_matching': True, }] def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None): From 6f2d82a5a0fd020ac36bdfe4faa0ee8a9f4945ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 30 Jul 2018 23:10:40 +0700 Subject: [PATCH 368/488] [pbs] Fix extraction (closes #17109) --- youtube_dl/extractor/pbs.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 52ab2f158..80340f595 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -15,6 +15,7 @@ from ..utils import ( strip_jsonp, strip_or_none, unified_strdate, + url_or_none, US_RATINGS, ) @@ -557,6 +558,13 @@ class PBSIE(InfoExtractor): if redirect_url and redirect_url not in redirect_urls: redirects.append(redirect) redirect_urls.add(redirect_url) + encodings = info.get('encodings') + if isinstance(encodings, list): + for encoding in encodings: + encoding_url = url_or_none(encoding) + if encoding_url and encoding_url not in redirect_urls: + redirects.append({'url': encoding_url}) + redirect_urls.add(encoding_url) chapters = [] # Player pages may also serve different qualities From 19b9de13c4e83416f09708dddecba6edc69b4525 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 30 Jul 2018 23:28:44 +0700 Subject: [PATCH 369/488] [watchbox] Fix extraction (closes #17107) --- youtube_dl/extractor/watchbox.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py index d99313080..5a4e46e73 100644 --- a/youtube_dl/extractor/watchbox.py +++ b/youtube_dl/extractor/watchbox.py @@ -10,6 +10,7 @@ from ..utils import ( js_to_json, strip_or_none, try_get, + unescapeHTML, unified_timestamp, ) @@ -67,12 +68,20 @@ class WatchBoxIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - source = (self._parse_json( + player_config = self._parse_json( self._search_regex( - r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config', - default='{}'), - video_id, transform_source=js_to_json, - fatal=False) or {}).get('source') or {} + r'data-player-conf=(["\'])(?P<data>{.+?})\1', webpage, + 'player config', default='{}', group='data'), + video_id, transform_source=unescapeHTML, fatal=False) + + if not player_config: + player_config = self._parse_json( + self._search_regex( + r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) or {} + + source = player_config.get('source') or {} video_id = compat_str(source.get('videoId') or video_id) From 644921b3722093720216ba62f09db5a75a0b43e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 2 Aug 2018 23:16:15 +0700 Subject: [PATCH 370/488] [twitch:vod] Improve _VALID_URL (closes #17135) --- youtube_dl/extractor/twitch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 89ee44224..2c9363ffb 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -240,7 +240,7 @@ class TwitchVodIE(TwitchItemBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v|videos)/| + (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/| player\.twitch\.tv/\?.*?\bvideo=v ) (?P<id>\d+) @@ -296,6 +296,9 @@ class TwitchVodIE(TwitchItemBaseIE): }, { 'url': 'https://m.twitch.tv/beagsandjam/v/247478721', 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/northernlion/video/291940395', + 'only_matching': True, }] def _real_extract(self, url): From 48afc6ca3ea8b666b0af24b0d9f424917d134d1b Mon Sep 17 00:00:00 2001 From: Tim Broder <tim@kidfund.us> Date: Tue, 31 Jul 2018 16:49:30 -0400 Subject: [PATCH 371/488] [twitch] Fix authentication (closes #17024) --- youtube_dl/extractor/twitch.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 2c9363ffb..cbb735403 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import itertools import re import random +import json from .common import InfoExtractor from ..compat import ( @@ -26,7 +27,6 @@ from ..utils import ( try_get, unified_timestamp, update_url_query, - urlencode_postdata, url_or_none, urljoin, ) @@ -37,7 +37,8 @@ class TwitchBaseIE(InfoExtractor): _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'https://usher.ttvnw.net' - _LOGIN_URL = 'https://www.twitch.tv/login' + _LOGIN_FORM_URL = 'https://www.twitch.tv/login' + _LOGIN_POST_URL = 'https://passport.twitch.tv/login' _CLIENT_ID = 'jzkbprff40iqj646a697cyrvl0zt2m6' _NETRC_MACHINE = 'twitch' @@ -77,21 +78,25 @@ class TwitchBaseIE(InfoExtractor): page_url = urlh.geturl() post_url = self._search_regex( r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page, - 'post url', default=page_url, group='url') + 'post url', default=self._LOGIN_POST_URL, group='url') post_url = urljoin(page_url, post_url) - headers = {'Referer': page_url} + headers = { + 'Referer': page_url, + 'Origin': page_url, + 'Content-Type': 'text/plain;charset=UTF-8' + } try: response = self._download_json( post_url, None, note, - data=urlencode_postdata(form), + data=json.dumps(form).encode(), headers=headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: response = self._parse_json( e.cause.read().decode('utf-8'), None) - fail(response.get('message') or response['errors'][0]) + fail(response.get('error_description') or response.get('error_code')) raise if 'Authenticated successfully' in response.get('message', ''): @@ -105,7 +110,7 @@ class TwitchBaseIE(InfoExtractor): headers=headers) login_page, handle = self._download_webpage_handle( - self._LOGIN_URL, None, 'Downloading login page') + self._LOGIN_FORM_URL, None, 'Downloading login page') # Some TOR nodes and public proxies are blocked completely if 'blacklist_message' in login_page: @@ -115,6 +120,7 @@ class TwitchBaseIE(InfoExtractor): login_page, handle, 'Logging in', { 'username': username, 'password': password, + 'client_id': self._CLIENT_ID, }) # Successful login From cb1c3a3c07b2afe854ecd8c97f77d8f62720d7a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 3 Aug 2018 22:43:06 +0700 Subject: [PATCH 372/488] [twitch] Update cliend id and modernize (closes #17126) --- youtube_dl/extractor/twitch.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index cbb735403..b39972b1e 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -8,7 +8,6 @@ import json from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_kwargs, compat_parse_qs, compat_str, @@ -39,7 +38,7 @@ class TwitchBaseIE(InfoExtractor): _USHER_BASE = 'https://usher.ttvnw.net' _LOGIN_FORM_URL = 'https://www.twitch.tv/login' _LOGIN_POST_URL = 'https://passport.twitch.tv/login' - _CLIENT_ID = 'jzkbprff40iqj646a697cyrvl0zt2m6' + _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko' _NETRC_MACHINE = 'twitch' def _handle_error(self, response): @@ -84,20 +83,15 @@ class TwitchBaseIE(InfoExtractor): headers = { 'Referer': page_url, 'Origin': page_url, - 'Content-Type': 'text/plain;charset=UTF-8' + 'Content-Type': 'text/plain;charset=UTF-8', } - try: - response = self._download_json( - post_url, None, note, - data=json.dumps(form).encode(), - headers=headers) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - response = self._parse_json( - e.cause.read().decode('utf-8'), None) - fail(response.get('error_description') or response.get('error_code')) - raise + response = self._download_json( + post_url, None, note, data=json.dumps(form).encode(), + headers=headers, expected_status=400) + error = response.get('error_description') or response.get('error_code') + if error: + fail(error) if 'Authenticated successfully' in response.get('message', ''): return None, None From af322eb830525edaf39a202b7ecaface2bfc1bad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Aug 2018 00:26:58 +0700 Subject: [PATCH 373/488] [funk:channel] Improve byChannelAlias extraction (closes #17142) --- youtube_dl/extractor/funk.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/funk.py b/youtube_dl/extractor/funk.py index 76c20ffac..7e1af95e0 100644 --- a/youtube_dl/extractor/funk.py +++ b/youtube_dl/extractor/funk.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor @@ -125,17 +126,31 @@ class FunkChannelIE(FunkBaseIE): # Id-based channels are currently broken on their side: webplayer # tries to process them via byChannelAlias endpoint and fails # predictably. - by_channel_alias = self._download_json( - 'https://www.funk.net/api/v3.1/webapp/videos/byChannelAlias/%s' - % channel_id, - 'Downloading byChannelAlias JSON', headers=headers, query={ - 'size': 100, - }, fatal=False) - if by_channel_alias: + for page_num in itertools.count(): + by_channel_alias = self._download_json( + 'https://www.funk.net/api/v3.1/webapp/videos/byChannelAlias/%s' + % channel_id, + 'Downloading byChannelAlias JSON page %d' % (page_num + 1), + headers=headers, query={ + 'filterFsk': 'false', + 'sort': 'creationDate,desc', + 'size': 100, + 'page': page_num, + }, fatal=False) + if not by_channel_alias: + break video_list = try_get( by_channel_alias, lambda x: x['_embedded']['videoList'], list) - if video_list: + if not video_list: + break + try: video = next(r for r in video_list if r.get('alias') == alias) + break + except StopIteration: + pass + if not try_get( + by_channel_alias, lambda x: x['_links']['next']): + break if not video: by_id_list = self._download_json( From 20f96f64bd1c5825604995800fde741c5581fd83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Aug 2018 01:20:55 +0700 Subject: [PATCH 374/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ChangeLog b/ChangeLog index dfa27d3be..ca18e2a7d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +version <unreleased> + +Extractors +* [funk:channel] Improve byChannelAlias extraction (#17142) +* [twitch] Fix authentication (#17024, #17126) +* [twitch:vod] Improve URL regular expression (#17135) +* [watchbox] Fix extraction (#17107) +* [pbs] Fix extraction (#17109) +* [theplatform] Relax URL regular expression (#16181, #17097) ++ [viqeo] Add support for viqeo.tv (#17066) + + version 2018.07.29 Extractors From 81cc22bab66e27e64039263fe2cdbc4b8dfb41e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Aug 2018 01:23:24 +0700 Subject: [PATCH 375/488] release 2018.08.04 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index cae5fd749..20433e915 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.29*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.29** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.08.04*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.08.04** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.07.29 +[debug] youtube-dl version 2018.08.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index ca18e2a7d..665503827 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.08.04 Extractors * [funk:channel] Improve byChannelAlias extraction (#17142) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index f464d89db..4bf2ec81b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1001,6 +1001,7 @@ - **Vimple**: Vimple - one-click video hosting - **Vine** - **vine:user** + - **Viqeo** - **Viu** - **viu:ott** - **viu:playlist** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2048b69d2..67394fa01 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.07.29' +__version__ = '2018.08.04' From d588d4a5a69a1dee1a52ceb60e963687176d0493 Mon Sep 17 00:00:00 2001 From: Stanny Nuytkens <stannynuytkens@users.noreply.github.com> Date: Sun, 5 Aug 2018 17:10:01 +0200 Subject: [PATCH 376/488] [.gitignore] Add .vscode --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index f064a0d9e..c4870a6ba 100644 --- a/.gitignore +++ b/.gitignore @@ -48,3 +48,6 @@ youtube-dl.zsh tmp/ venv/ + +# VS Code related files +.vscode From a62460aa213c628874a4e6aee9d835433c705b94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 6 Aug 2018 04:37:03 +0700 Subject: [PATCH 377/488] [imdb] Fix extension extraction (closes #17167) --- youtube_dl/extractor/imdb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index fba01ef49..436759da5 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -64,7 +64,8 @@ class ImdbIE(InfoExtractor): video_url = url_or_none(encoding.get('videoUrl')) if not video_url: continue - ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType'))) + ext = mimetype2ext(encoding.get( + 'mimeType')) or determine_ext(video_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', entry_protocol='m3u8_native', From d37dc6e1c956ce0f390a4b832f8388cb94254f90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 7 Aug 2018 23:27:08 +0700 Subject: [PATCH 378/488] [clyp] Add support for token protected media (closes #17184) --- youtube_dl/extractor/clyp.py | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py index 57e643799..06d04de13 100644 --- a/youtube_dl/extractor/clyp.py +++ b/youtube_dl/extractor/clyp.py @@ -1,15 +1,19 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( float_or_none, - parse_iso8601, + unified_timestamp, ) class ClypIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://clyp.it/ojz2wfah', 'md5': '1d4961036c41247ecfdcc439c0cddcbb', 'info_dict': { @@ -21,13 +25,34 @@ class ClypIE(InfoExtractor): 'timestamp': 1443515251, 'upload_date': '20150929', }, - } + }, { + 'url': 'https://clyp.it/b04p1odi?token=b0078e077e15835845c528a44417719d', + 'info_dict': { + 'id': 'b04p1odi', + 'ext': 'mp3', + 'title': 'GJ! (Reward Edit)', + 'description': 'Metal Resistance (THE ONE edition)', + 'duration': 177.789, + 'timestamp': 1528241278, + 'upload_date': '20180605', + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): audio_id = self._match_id(url) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + token = qs.get('token', [None])[0] + + query = {} + if token: + query['token'] = token + metadata = self._download_json( - 'https://api.clyp.it/%s' % audio_id, audio_id) + 'https://api.clyp.it/%s' % audio_id, audio_id, query=query) formats = [] for secure in ('', 'Secure'): @@ -45,7 +70,7 @@ class ClypIE(InfoExtractor): title = metadata['Title'] description = metadata.get('Description') duration = float_or_none(metadata.get('Duration')) - timestamp = parse_iso8601(metadata.get('DateCreated')) + timestamp = unified_timestamp(metadata.get('DateCreated')) return { 'id': audio_id, From b65e3b0636c4b992f3187bff7d5268c6891023a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 12 Aug 2018 01:47:10 +0700 Subject: [PATCH 379/488] [bitchute] Add extractor (closes #14052) --- youtube_dl/extractor/bitchute.py | 116 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 4 + 2 files changed, 120 insertions(+) create mode 100644 youtube_dl/extractor/bitchute.py diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py new file mode 100644 index 000000000..28016f18b --- /dev/null +++ b/youtube_dl/extractor/bitchute.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..utils import urlencode_postdata + + +class BitChuteIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.bitchute.com/video/szoMrox2JEI/', + 'md5': '66c4a70e6bfc40dcb6be3eb1d74939eb', + 'info_dict': { + 'id': 'szoMrox2JEI', + 'ext': 'mp4', + 'title': 'Fuck bitches get money', + 'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Victoria X Rave', + }, + }, { + 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', + 'only_matching': True, + }, { + 'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.bitchute.com/video/%s' % video_id, video_id) + + title = self._search_regex( + (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'), + webpage, 'title', default=None) or self._html_search_meta( + 'description', webpage, 'title', + default=None) or self._og_search_description(webpage) + + formats = [ + {'url': mobj.group('url')} + for mobj in re.finditer( + r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage)] + self._sort_formats(formats) + + description = self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image:src', webpage, 'thumbnail') + uploader = self._html_search_regex( + r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>', webpage, + 'uploader', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'formats': formats, + } + + +class BitChuteChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.bitchute.com/channel/victoriaxrave/', + 'playlist_mincount': 185, + 'info_dict': { + 'id': 'victoriaxrave', + }, + } + + _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' + + def _entries(self, channel_id): + channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id + for page_num in itertools.count(0): + data = self._download_json( + '%sextend/' % channel_url, channel_id, + 'Downloading channel page %d' % (page_num + 1), + data=urlencode_postdata({ + 'csrfmiddlewaretoken': self._TOKEN, + 'name': '', + 'offset': page_num * 25 + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': channel_url, + 'X-Requested-With': 'XMLHttpRequest', + 'Cookie': 'csrftoken=%s' % self._TOKEN, + }) + if data.get('success') is False: + break + html = data.get('html') + if not html: + break + video_ids = re.findall( + r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', + html) + if not video_ids: + break + for video_id in video_ids: + yield self.url_result( + 'https://www.bitchute.com/video/%s' % video_id, + ie=BitChuteIE.ie_key(), video_id=video_id) + + def _real_extract(self, url): + channel_id = self._match_id(url) + return self.playlist_result( + self._entries(channel_id), playlist_id=channel_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c7a91a986..eb22d0999 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -118,6 +118,10 @@ from .bilibili import ( BiliBiliBangumiIE, ) from .biobiochiletv import BioBioChileTVIE +from .bitchute import ( + BitChuteIE, + BitChuteChannelIE, +) from .biqle import BIQLEIE from .bleacherreport import ( BleacherReportIE, From de4c41b437c7a0126074d75ddeeccbf4470d0684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 12 Aug 2018 01:52:50 +0700 Subject: [PATCH 380/488] [bitchute] Improve page offset --- youtube_dl/extractor/bitchute.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py index 28016f18b..da263714a 100644 --- a/youtube_dl/extractor/bitchute.py +++ b/youtube_dl/extractor/bitchute.py @@ -81,14 +81,15 @@ class BitChuteChannelIE(InfoExtractor): def _entries(self, channel_id): channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id - for page_num in itertools.count(0): + offset = 0 + for page_num in itertools.count(1): data = self._download_json( '%sextend/' % channel_url, channel_id, - 'Downloading channel page %d' % (page_num + 1), + 'Downloading channel page %d' % page_num, data=urlencode_postdata({ 'csrfmiddlewaretoken': self._TOKEN, 'name': '', - 'offset': page_num * 25 + 'offset': offset, }), headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Referer': channel_url, @@ -105,6 +106,7 @@ class BitChuteChannelIE(InfoExtractor): html) if not video_ids: break + offset += len(video_ids) for video_id in video_ids: yield self.url_result( 'https://www.bitchute.com/video/%s' % video_id, From 4779420ce8e13d0df33189907e477146a2a0cefe Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 12 Aug 2018 05:31:18 +0100 Subject: [PATCH 381/488] [redbulltv] add support redbull.com tv URLs(closes #17218) --- youtube_dl/extractor/redbulltv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py index 243603676..7e8d58f38 100644 --- a/youtube_dl/extractor/redbulltv.py +++ b/youtube_dl/extractor/redbulltv.py @@ -10,7 +10,7 @@ from ..utils import ( class RedBullTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redbull\.tv/video/(?P<id>AP-\w+)' + _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com/(?:[^/]+/)?tv)/video/(?P<id>AP-\w+)' _TESTS = [{ # film 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', @@ -35,6 +35,9 @@ class RedBullTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.redbull.com/int-en/tv/video/AP-1UWHCAR9S1W11/rob-meets-sam-gaze?playlist=playlists::3f81040a-2f31-4832-8e2e-545b1d39d173', + 'only_matching': True, }] def _real_extract(self, url): From 24e0cd709fae13d9192361dae27ecd292c6b1fd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 13 Aug 2018 00:15:59 +0700 Subject: [PATCH 382/488] [raywenderlich] Adapt to site redesign (closes #17225) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/raywenderlich.py | 189 ++++++++++++++++++-------- 2 files changed, 137 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index eb22d0999..23ad42da9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -899,7 +899,10 @@ from .rai import ( RaiPlayPlaylistIE, RaiIE, ) -from .raywenderlich import RayWenderlichIE +from .raywenderlich import ( + RayWenderlichIE, + RayWenderlichCourseIE, +) from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redbulltv import RedBullTVIE diff --git a/youtube_dl/extractor/raywenderlich.py b/youtube_dl/extractor/raywenderlich.py index 640c3ee23..5411ece21 100644 --- a/youtube_dl/extractor/raywenderlich.py +++ b/youtube_dl/extractor/raywenderlich.py @@ -4,24 +4,37 @@ import re from .common import InfoExtractor from .vimeo import VimeoIE +from ..compat import compat_str from ..utils import ( - extract_attributes, ExtractorError, - smuggle_url, - unsmuggle_url, + int_or_none, + merge_dicts, + try_get, + unescapeHTML, + unified_timestamp, urljoin, ) class RayWenderlichIE(InfoExtractor): - _VALID_URL = r'https?://videos\.raywenderlich\.com/courses/(?P<course_id>[^/]+)/lessons/(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<course_id>[^/]+)/lessons/(?P<id>\d+) + ''' _TESTS = [{ - 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1', 'info_dict': { 'id': '248377018', 'ext': 'mp4', - 'title': 'Testing In iOS Episode 1: Introduction', + 'title': 'Introduction', + 'description': 'md5:804d031b3efa9fcb49777d512d74f722', + 'timestamp': 1513906277, + 'upload_date': '20171222', 'duration': 133, 'uploader': 'Ray Wenderlich', 'uploader_id': 'user3304672', @@ -34,69 +47,133 @@ class RayWenderlichIE(InfoExtractor): 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'only_matching': True, + }] + + @staticmethod + def _extract_video_id(data, lesson_id): + if not data: + return + groups = try_get(data, lambda x: x['groups'], list) or [] + if not groups: + return + for group in groups: + if not isinstance(group, dict): + continue + contents = try_get(data, lambda x: x['contents'], list) or [] + for content in contents: + if not isinstance(content, dict): + continue + ordinal = int_or_none(content.get('ordinal')) + if ordinal != lesson_id: + continue + video_id = content.get('identifier') + if video_id: + return compat_str(video_id) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id, lesson_id = mobj.group('course_id', 'id') + display_id = '%s/%s' % (course_id, lesson_id) + + webpage = self._download_webpage(url, display_id) + + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image', webpage, 'thumbnail') + + if '>Subscribe to unlock' in webpage: + raise ExtractorError( + 'This content is only available for subscribers', + expected=True) + + info = { + 'thumbnail': thumbnail, + } + + vimeo_id = self._search_regex( + r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None) + + if not vimeo_id: + data = self._parse_json( + self._search_regex( + r'data-collection=(["\'])(?P<data>{.+?})\1', webpage, + 'data collection', default='{}', group='data'), + display_id, transform_source=unescapeHTML, fatal=False) + video_id = self._extract_video_id( + data, lesson_id) or self._search_regex( + r'/videos/(\d+)/', thumbnail, 'video id') + headers = { + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + } + csrf_token = self._html_search_meta( + 'csrf-token', webpage, 'csrf token', default=None) + if csrf_token: + headers['X-CSRF-Token'] = csrf_token + video = self._download_json( + 'https://videos.raywenderlich.com/api/v1/videos/%s.json' + % video_id, display_id, headers=headers)['video'] + vimeo_id = video['clips'][0]['provider_id'] + info.update({ + '_type': 'url_transparent', + 'title': video.get('name'), + 'description': video.get('description') or video.get( + 'meta_description'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('created_at')), + }) + + return merge_dicts(info, self.url_result( + VimeoIE._smuggle_referrer( + 'https://player.vimeo.com/video/%s' % vimeo_id, url), + ie=VimeoIE.ie_key(), video_id=vimeo_id)) + + +class RayWenderlichCourseIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<id>[^/]+) + ''' + + _TEST = { + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios', 'info_dict': { 'title': 'Testing in iOS', - 'id': '105-testing-in-ios', + 'id': '3530-testing-in-ios', }, 'params': { 'noplaylist': False, }, 'playlist_count': 29, - }] + } + + @classmethod + def suitable(cls, url): + return False if RayWenderlichIE.suitable(url) else super( + RayWenderlichCourseIE, cls).suitable(url) def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) + course_id = self._match_id(url) - mobj = re.match(self._VALID_URL, url) - course_id, lesson_id = mobj.group('course_id', 'id') - video_id = '%s/%s' % (course_id, lesson_id) - - webpage = self._download_webpage(url, video_id) - - no_playlist = self._downloader.params.get('noplaylist') - if no_playlist or smuggled_data.get('force_video', False): - if no_playlist: - self.to_screen( - 'Downloading just video %s because of --no-playlist' - % video_id) - if '>Subscribe to unlock' in webpage: - raise ExtractorError( - 'This content is only available for subscribers', - expected=True) - vimeo_id = self._search_regex( - r'data-vimeo-id=["\'](\d+)', webpage, 'video id') - return self.url_result( - VimeoIE._smuggle_referrer( - 'https://player.vimeo.com/video/%s' % vimeo_id, url), - ie=VimeoIE.ie_key(), video_id=vimeo_id) - - self.to_screen( - 'Downloading playlist %s - add --no-playlist to just download video' - % course_id) - - lesson_ids = set((lesson_id, )) - for lesson in re.findall( - r'(<a[^>]+\bclass=["\']lesson-link[^>]+>)', webpage): - attrs = extract_attributes(lesson) - if not attrs: - continue - lesson_url = attrs.get('href') - if not lesson_url: - continue - lesson_id = self._search_regex( - r'/lessons/(\d+)', lesson_url, 'lesson id', default=None) - if not lesson_id: - continue - lesson_ids.add(lesson_id) + webpage = self._download_webpage(url, course_id) entries = [] - for lesson_id in sorted(lesson_ids): + lesson_urls = set() + for lesson_url in re.findall( + r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage): + if lesson_url in lesson_urls: + continue + lesson_urls.add(lesson_url) entries.append(self.url_result( - smuggle_url(urljoin(url, lesson_id), {'force_video': True}), - ie=RayWenderlichIE.ie_key())) + urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key())) - title = self._search_regex( - r'class=["\']course-title[^>]+>([^<]+)', webpage, 'course title', - default=None) + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) return self.playlist_result(entries, course_id, title) From 57c68ec4c303f471b2519db21b9b56c080d88338 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 14 Aug 2018 22:51:44 +0700 Subject: [PATCH 383/488] [generic] Add support for expressen embeds --- youtube_dl/extractor/expressen.py | 23 ++++++++++++++++++++++- youtube_dl/extractor/generic.py | 6 ++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/expressen.py b/youtube_dl/extractor/expressen.py index f61178012..934571472 100644 --- a/youtube_dl/extractor/expressen.py +++ b/youtube_dl/extractor/expressen.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( determine_ext, @@ -11,7 +13,13 @@ from ..utils import ( class ExpressenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?expressen\.se/tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?expressen\.se/ + (?:(?:tvspelare/video|videoplayer/embed)/)? + tv/(?:[^/]+/)* + (?P<id>[^/?#&]+) + ''' _TESTS = [{ 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/', 'md5': '2fbbe3ca14392a6b1b36941858d33a45', @@ -28,8 +36,21 @@ class ExpressenIE(InfoExtractor): }, { 'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/', 'only_matching': True, + }, { + 'url': 'https://www.expressen.se/tvspelare/video/tv/ditv/ekonomistudion/experterna-har-ar-fragorna-som-avgor-valet/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di', + 'only_matching': True, + }, { + 'url': 'https://www.expressen.se/videoplayer/embed/tv/ditv/ekonomistudion/experterna-har-ar-fragorna-som-avgor-valet/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di', + 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') for mobj in re.finditer( + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?expressen\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1', + webpage)] + def _real_extract(self, url): display_id = self._match_id(url) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 43218c3a4..83a31f3d3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -114,6 +114,7 @@ from .indavideo import IndavideoEmbedIE from .apa import APAIE from .foxnews import FoxNewsIE from .viqeo import ViqeoIE +from .expressen import ExpressenIE class GenericIE(InfoExtractor): @@ -3109,6 +3110,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key()) + expressen_urls = ExpressenIE._extract_urls(webpage) + if expressen_urls: + return self.playlist_from_matches( + expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: From 60c0856223391864135694197db2d17125fe4e8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 14 Aug 2018 23:27:12 +0700 Subject: [PATCH 384/488] [utils] Use pure browser header for User-Agent (closes #17236) --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 29cafd8f0..0c830ba71 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -82,7 +82,7 @@ def register_socks_protocols(): compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', From d6ef8b4dd4230233b8ada6b1f8d7083df86a7cd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 16 Aug 2018 00:11:41 +0700 Subject: [PATCH 385/488] [nova] Fix extraction (closes #17241) --- youtube_dl/extractor/nova.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 06cb8cb3f..edf8b82c6 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -6,7 +6,9 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + js_to_json, unified_strdate, + url_or_none, ) @@ -111,8 +113,21 @@ class NovaIE(InfoExtractor): webpage, 'video id') config_url = self._search_regex( - r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', + r'src="(https?://(?:tn|api)\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', webpage, 'config url', default=None) + config_params = {} + + if not config_url: + player = self._parse_json( + self._search_regex( + r'(?s)Player\s*\(.+?\s*,\s*({.+?\bmedia\b["\']?\s*:\s*["\']?\d+.+?})\s*\)', webpage, + 'player', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if player: + config_url = url_or_none(player.get('configUrl')) + params = player.get('configParams') + if isinstance(params, dict): + config_params = params if not config_url: DEFAULT_SITE_ID = '23000' @@ -127,14 +142,20 @@ class NovaIE(InfoExtractor): } site_id = self._search_regex( - r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID) + r'site=(\d+)', webpage, 'site id', default=None) or SITES.get( + site, DEFAULT_SITE_ID) - config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig' - % (site_id, video_id)) + config_url = 'https://api.nova.cz/bin/player/videojs/config.php' + config_params = { + 'site': site_id, + 'media': video_id, + 'quality': 3, + 'version': 1, + } config = self._download_json( config_url, display_id, - 'Downloading config JSON', + 'Downloading config JSON', query=config_params, transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) mediafile = config['mediafile'] From 0a74b4519147aec45dc39756dc55b5755beefbc2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 17 Aug 2018 11:59:22 +0100 Subject: [PATCH 386/488] [cwtv] fix extraction(closes #17256) --- youtube_dl/extractor/cwtv.py | 82 +++++++++++++----------------------- 1 file changed, 29 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index f4cf0f1c5..224a1fb5d 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, + parse_age_limit, parse_iso8601, + smuggle_url, + str_or_none, ) @@ -40,10 +43,15 @@ class CWTVIE(InfoExtractor): 'duration': 1263, 'series': 'Whose Line Is It Anyway?', 'season_number': 11, - 'season': '11', 'episode_number': 20, 'upload_date': '20151006', 'timestamp': 1444107300, + 'age_limit': 14, + 'uploader': 'CWTV', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, }, { 'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6', @@ -58,60 +66,28 @@ class CWTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = None - formats = [] - for partner in (154, 213): - vdata = self._download_json( - 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/%d?format=json' % (video_id, partner), video_id, fatal=False) - if not vdata: - continue - video_data = vdata - for quality, quality_data in vdata.get('videos', {}).items(): - quality_url = quality_data.get('uri') - if not quality_url: - continue - if quality == 'variantplaylist': - formats.extend(self._extract_m3u8_formats( - quality_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - else: - tbr = int_or_none(quality_data.get('bitrate')) - format_id = 'http' + ('-%d' % tbr if tbr else '') - if self._is_valid_url(quality_url, video_id, format_id): - formats.append({ - 'format_id': format_id, - 'url': quality_url, - 'tbr': tbr, - }) - video_metadata = video_data['assetFields'] - ism_url = video_metadata.get('smoothStreamingUrl') - if ism_url: - formats.extend(self._extract_ism_formats( - ism_url, video_id, ism_id='mss', fatal=False)) - self._sort_formats(formats) + video_data = self._download_json( + 'http://images.cwtv.com/feed/mobileapp/video-meta/apiversion_8/guid_' + video_id, + video_id)['video'] + title = video_data['title'] + mpx_url = video_data.get('mpx_url') or 'http://link.theplatform.com/s/cwtv/media/guid/2703454149/%s?formats=M3U' % video_id - thumbnails = [{ - 'url': image['uri'], - 'width': image.get('width'), - 'height': image.get('height'), - } for image_id, image in video_data['images'].items() if image.get('uri')] if video_data.get('images') else None - - subtitles = { - 'en': [{ - 'url': video_metadata['UnicornCcUrl'], - }], - } if video_metadata.get('UnicornCcUrl') else None + season = str_or_none(video_data.get('season')) + episode = str_or_none(video_data.get('episode')) + if episode and season: + episode = episode.lstrip(season) return { + '_type': 'url_transparent', 'id': video_id, - 'title': video_metadata['title'], - 'description': video_metadata.get('description'), - 'duration': int_or_none(video_metadata.get('duration')), - 'series': video_metadata.get('seriesName'), - 'season_number': int_or_none(video_metadata.get('seasonNumber')), - 'season': video_metadata.get('seasonName'), - 'episode_number': int_or_none(video_metadata.get('episodeNumber')), - 'timestamp': parse_iso8601(video_data.get('startTime')), - 'thumbnails': thumbnails, - 'formats': formats, - 'subtitles': subtitles, + 'title': title, + 'url': smuggle_url(mpx_url, {'force_smil_url': True}), + 'description': video_data.get('description_long'), + 'duration': int_or_none(video_data.get('duration_secs')), + 'series': video_data.get('series_name'), + 'season_number': int_or_none(season), + 'episode_number': int_or_none(episode), + 'timestamp': parse_iso8601(video_data.get('start_time')), + 'age_limit': parse_age_limit(video_data.get('rating')), + 'ie_key': 'ThePlatform', } From 6f356cbbcf0bb3e3e3abce912d451d1852999b67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 20 Aug 2018 02:05:07 +0700 Subject: [PATCH 387/488] [bbccouk] Extend _ID_REGEX (closes #17270) --- youtube_dl/extractor/bbc.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 641bf6073..abcfa301d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -29,7 +29,7 @@ from ..compat import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'[pbw][\da-z]{7}' + _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ @@ -236,6 +236,12 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/programmes/m00005xn', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s', + 'only_matching': True, }] _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' From bf1245d236f4199497a8cbc42ff0baa5f92f9edc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 20 Aug 2018 02:15:48 +0700 Subject: [PATCH 388/488] [lci] Fix extraction (closes #17274) --- youtube_dl/extractor/lci.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lci.py b/youtube_dl/extractor/lci.py index af34829e7..920872f5c 100644 --- a/youtube_dl/extractor/lci.py +++ b/youtube_dl/extractor/lci.py @@ -20,5 +20,7 @@ class LCIIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - wat_id = self._search_regex(r'data-watid=[\'"](\d+)', webpage, 'wat id') + wat_id = self._search_regex( + (r'data-watid=[\'"](\d+)', r'idwat["\']?\s*:\s*["\']?(\d+)'), + webpage, 'wat id') return self.url_result('wat:' + wat_id, 'Wat', wat_id) From eda86b4335a464c2744adb537f1296eff24f5bde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 21 Aug 2018 23:45:18 +0700 Subject: [PATCH 389/488] [anvato] Fallback to generic API key for access key to API key lookup (closes #16788, closes #17254) --- youtube_dl/extractor/anvato.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index f6a78eb5d..84e841035 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -134,9 +134,33 @@ class AnvatoIE(InfoExtractor): 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' } + _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' + _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' + _TESTS = [{ + # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874 + 'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496', + 'info_dict': { + 'id': '4465496', + 'ext': 'mp4', + 'title': 'VIDEO: Humpback whale breaches right next to NH boat', + 'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.', + 'duration': 22, + 'timestamp': 1534855680, + 'upload_date': '20180821', + 'uploader': 'ANV', + }, + 'params': { + 'skip_download': True, + }, + }, { + # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/ + 'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601', + 'only_matching': True, + }] + def __init__(self, *args, **kwargs): super(AnvatoIE, self).__init__(*args, **kwargs) self.__server_time = None @@ -169,7 +193,8 @@ class AnvatoIE(InfoExtractor): 'api': { 'anvrid': anvrid, 'anvstk': md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, self._ANVACK_TABLE[access_key])), + access_key, anvrid, server_time, + self._ANVACK_TABLE.get(access_key, self._API_KEY))), 'anvts': server_time, }, } @@ -284,5 +309,6 @@ class AnvatoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) access_key, video_id = mobj.group('access_key_or_mcp', 'id') if access_key not in self._ANVACK_TABLE: - access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key] + access_key = self._MCP_TO_ACCESS_KEY_TABLE.get( + access_key) or access_key return self._get_anvato_videos(access_key, video_id) From 28f96cf407cecbac21fe2c7d3417928dae3ab3b5 Mon Sep 17 00:00:00 2001 From: hmlinaric <mlinaric.hrvoje@gmail.com> Date: Tue, 21 Aug 2018 19:06:27 +0200 Subject: [PATCH 390/488] [6play] Add support for play.rtl.hr --- youtube_dl/extractor/sixplay.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index a363221bc..207ab4477 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -19,7 +19,7 @@ from ..utils import ( class SixPlayIE(InfoExtractor): IE_NAME = '6play' - _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay.be)/.+?-c_)(?P<id>[0-9]+)' + _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay\.be|play\.rtl\.hr)/.+?-c_)(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051', 'md5': '31fcd112637baa0c2ab92c4fcd8baf27', @@ -32,6 +32,9 @@ class SixPlayIE(InfoExtractor): }, { 'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869', 'only_matching': True, + }, { + 'url': 'https://play.rtl.hr/pj-masks-p_9455/epizoda-34-sezona-1-catboyevo-cudo-na-dva-kotaca-c_11984989', + 'only_matching': True, }] def _real_extract(self, url): @@ -39,6 +42,7 @@ class SixPlayIE(InfoExtractor): service, consumer_name = { '6play.fr': ('6play', 'm6web'), 'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'), + 'play.rtl.hr': ('rtlhr_rtl_play', 'rtlhr'), }.get(domain, ('6play', 'm6web')) data = self._download_json( From 52007de8ca6fcc2b72cbfb3fe026ea63cda88f25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Aug 2018 01:14:47 +0700 Subject: [PATCH 391/488] [go] Add support for disneynow.go.com (closes #16299, closes #17264) --- youtube_dl/extractor/go.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index e781405f2..ec9dd6e3a 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -36,7 +36,8 @@ class GoIE(AdobePassIE): 'requestor_id': 'DisneyXD', } } - _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) + _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))'\ + % '|'.join(list(_SITE_INFO.keys()) + ['disneynow']) _TESTS = [{ 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', 'info_dict': { @@ -62,6 +63,14 @@ class GoIE(AdobePassIE): }, { 'url': 'http://abc.go.com/shows/world-news-tonight/episode-guide/2017-02/17-021717-intense-stand-off-between-man-with-rifle-and-police-in-oakland', 'only_matching': True, + }, { + # brand 004 + 'url': 'http://disneynow.go.com/shows/big-hero-6-the-series/season-01/episode-10-mr-sparkles-loses-his-sparkle/vdka4637915', + 'only_matching': True, + }, { + # brand 008 + 'url': 'http://disneynow.go.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013', + 'only_matching': True, }] def _extract_videos(self, brand, video_id='-1', show_id='-1'): @@ -72,14 +81,23 @@ class GoIE(AdobePassIE): def _real_extract(self, url): sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups() - site_info = self._SITE_INFO[sub_domain] - brand = site_info['brand'] - if not video_id: - webpage = self._download_webpage(url, display_id) + site_info = self._SITE_INFO.get(sub_domain, {}) + brand = site_info.get('brand') + if not video_id or not site_info: + webpage = self._download_webpage(url, display_id or video_id) video_id = self._search_regex( # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', default=None) + r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', + default=None) + if not site_info: + brand = self._search_regex( + (r'data-brand=\s*["\']\s*(\d+)', + r'data-page-brand=\s*["\']\s*(\d+)'), webpage, 'brand', + default='004') + site_info = next( + si for _, si in self._SITE_INFO.items() + if si.get('brand') == brand) if not video_id: # show extraction works for Disney, DisneyJunior and DisneyXD # ABC and Freeform has different layout From db192b29329696547b422d904819321077efddfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Aug 2018 01:44:22 +0700 Subject: [PATCH 392/488] [yourporn] Add extractor (closes #17298) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/yourporn.py | 41 ++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 youtube_dl/extractor/yourporn.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 23ad42da9..d8e86c277 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1424,6 +1424,7 @@ from .younow import ( YouNowMomentIE, ) from .youporn import YouPornIE +from .yourporn import YourPornIE from .yourupload import YourUploadIE from .youtube import ( YoutubeIE, diff --git a/youtube_dl/extractor/yourporn.py b/youtube_dl/extractor/yourporn.py new file mode 100644 index 000000000..6602f7c03 --- /dev/null +++ b/youtube_dl/extractor/yourporn.py @@ -0,0 +1,41 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import urljoin + + +class YourPornIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?yourporn\.sexy/post/(?P<id>[^/?#&.]+)' + _TEST = { + 'url': 'https://yourporn.sexy/post/57ffcb2e1179b.html', + 'md5': '6f8682b6464033d87acaa7a8ff0c092e', + 'info_dict': { + 'id': '57ffcb2e1179b', + 'ext': 'mp4', + 'title': 'md5:c9f43630bd968267672651ba905a7d35', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = urljoin(url, self._parse_json( + self._search_regex( + r'data-vnfo=(["\'])(?P<data>{.+?})\1', webpage, 'data info', + group='data'), + video_id)[video_id]) + + title = (self._search_regex( + r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title', + default=None) or self._og_search_description(webpage)).strip() + thumbnail = self._og_search_thumbnail(webpage) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + } From df4d817bc3802e776c5056c3288953aa0ff817b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Aug 2018 02:19:30 +0700 Subject: [PATCH 393/488] [kinopoisk] Add extractor (closes #17283) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/kinopoisk.py | 70 ++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 youtube_dl/extractor/kinopoisk.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d8e86c277..9fc1cfa70 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -520,6 +520,7 @@ from .keezmovies import KeezMoviesIE from .ketnet import KetnetIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE +from .kinopoisk import KinoPoiskIE from .keek import KeekIE from .konserthusetplay import KonserthusetPlayIE from .kontrtube import KontrTubeIE diff --git a/youtube_dl/extractor/kinopoisk.py b/youtube_dl/extractor/kinopoisk.py new file mode 100644 index 000000000..9e8d01f53 --- /dev/null +++ b/youtube_dl/extractor/kinopoisk.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + dict_get, + int_or_none, +) + + +class KinoPoiskIE(InfoExtractor): + _GEO_COUNTRIES = ['RU'] + _VALID_URL = r'https?://(?:www\.)?kinopoisk\.ru/film/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.kinopoisk.ru/film/81041/watch/', + 'md5': '4f71c80baea10dfa54a837a46111d326', + 'info_dict': { + 'id': '81041', + 'ext': 'mp4', + 'title': 'Алеша попович и тугарин змей', + 'description': 'md5:43787e673d68b805d0aa1df5a5aea701', + 'thumbnail': r're:^https?://.*', + 'duration': 4533, + 'age_limit': 12, + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + 'url': 'https://www.kinopoisk.ru/film/81041', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://ott-widget.kinopoisk.ru/v1/kp/', video_id, + query={'kpId': video_id}) + + data = self._parse_json( + self._search_regex( + r'(?s)<script[^>]+\btype=["\']application/json[^>]+>(.+?)<', + webpage, 'data'), + video_id)['models'] + + film = data['filmStatus'] + title = film.get('title') or film['originalTitle'] + + formats = self._extract_m3u8_formats( + data['playlistEntity']['uri'], video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + description = dict_get( + film, ('descriptscription', 'description', + 'shortDescriptscription', 'shortDescription')) + thumbnail = film.get('coverUrl') or film.get('posterUrl') + duration = int_or_none(film.get('duration')) + age_limit = int_or_none(film.get('restrictionAge')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, + } From b662273989ec18e9905b67467b5feded30f19d4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Aug 2018 02:28:25 +0700 Subject: [PATCH 394/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index 665503827..a4cc6e4ac 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version <unreleased> + +Core +* [utils] Use pure browser header for User-Agent (#17236) + +Extractors ++ [kinopoisk] Add support for kinopoisk.ru (#17283) ++ [yourporn] Add support for yourporn.sexy (#17298) ++ [go] Add support for disneynow.go.com (#16299, #17264) ++ [6play] Add support for play.rtl.hr (#17249) +* [anvato] Fallback to generic API key for access-key-to-API-key lookup + (#16788, #17254) +* [lci] Fix extraction (#17274) +* [bbccouk] Extend id URL regular expression (#17270) +* [cwtv] Fix extraction (#17256) +* [nova] Fix extraction (#17241) ++ [generic] Add support for expressen embeds +* [raywenderlich] Adapt to site redesign (#17225) ++ [redbulltv] Add support redbull.com tv URLs (#17218) ++ [bitchute] Add support for bitchute.com (#14052) ++ [clyp] Add support for token protected media (#17184) +* [imdb] Fix extension extraction (#17167) + + version 2018.08.04 Extractors From 4c86163b60ee3acc7f4b02c59c80d9027f378288 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Aug 2018 02:32:18 +0700 Subject: [PATCH 395/488] release 2018.08.22 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 5 +++++ youtube_dl/version.py | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 20433e915..6d54efbc3 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.08.04*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.08.04** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.08.22*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.08.22** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.08.04 +[debug] youtube-dl version 2018.08.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index a4cc6e4ac..c34e9682a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.08.22 Core * [utils] Use pure browser header for User-Agent (#17236) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 4bf2ec81b..5d2a355ff 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -108,6 +108,8 @@ - **BiliBili** - **BioBioChileTV** - **BIQLE** + - **BitChute** + - **BitChuteChannel** - **BleacherReport** - **BleacherReportCMS** - **blinkx** @@ -405,6 +407,7 @@ - **Ketnet** - **KhanAcademy** - **KickStarter** + - **KinoPoisk** - **KonserthusetPlay** - **kontrtube**: KontrTube.ru - Труба зовёт - **KrasView**: Красвью @@ -696,6 +699,7 @@ - **RaiPlayLive** - **RaiPlayPlaylist** - **RayWenderlich** + - **RayWenderlichCourse** - **RBMARadio** - **RDS**: RDS.ca - **RedBullTV** @@ -1093,6 +1097,7 @@ - **YouNowLive** - **YouNowMoment** - **YouPorn** + - **YourPorn** - **YourUpload** - **youtube**: YouTube.com - **youtube:channel**: YouTube.com channels diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 67394fa01..f68848389 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.08.04' +__version__ = '2018.08.22' From c707d2067d649f23f03cf11995d1eeb665d51838 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Aug 2018 23:18:07 +0700 Subject: [PATCH 396/488] [nova:embed] Add extractor (closes #17282) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/nova.py | 138 ++++++++++++++++++++--------- 2 files changed, 101 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9fc1cfa70..a8b89bcde 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -741,7 +741,10 @@ from .nonktube import NonkTubeIE from .noovo import NoovoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE -from .nova import NovaIE +from .nova import ( + NovaEmbedIE, + NovaIE, +) from .novamov import ( AuroraVidIE, CloudTimeIE, diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index edf8b82c6..80186ec50 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -6,30 +6,90 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + int_or_none, js_to_json, + qualities, unified_strdate, url_or_none, ) +class NovaEmbedIE(InfoExtractor): + _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', + 'md5': 'b3834f6de5401baabf31ed57456463f7', + 'info_dict': { + 'id': '8o0n0r', + 'ext': 'mp4', + 'title': '2180. díl', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2578, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + bitrates = self._parse_json( + self._search_regex( + r'(?s)bitrates\s*=\s*({.+?})\s*;', webpage, 'formats'), + video_id, transform_source=js_to_json) + + QUALITIES = ('lq', 'mq', 'hq', 'hd') + quality_key = qualities(QUALITIES) + + formats = [] + for format_id, format_list in bitrates.items(): + if not isinstance(format_list, list): + continue + for format_url in format_list: + format_url = url_or_none(format_url) + if not format_url: + continue + f = { + 'url': format_url, + } + f_id = format_id + for quality in QUALITIES: + if '%s.mp4' % quality in format_url: + f_id += '-%s' % quality + f.update({ + 'quality': quality_key(quality), + 'format_note': quality.upper(), + }) + break + f['format_id'] = f_id + formats.append(f) + self._sort_formats(formats) + + title = self._og_search_title( + webpage, default=None) or self._search_regex( + (r'<value>(?P<title>[^<]+)', + r'videoTitle\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'title', group='value') + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._search_regex( + r'poster\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'thumbnail', fatal=False, group='value') + duration = int_or_none(self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } + + class NovaIE(InfoExtractor): IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' _TESTS = [{ - 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus', - 'info_dict': { - 'id': '1608920', - 'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou', - 'ext': 'flv', - 'title': 'Duel: Michal Hrdlička a Petr Suchoň', - 'description': 'md5:d0cc509858eee1b1374111c588c6f5d5', - 'thumbnail': r're:^https?://.*\.(?:jpg)', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3', 'info_dict': { @@ -40,33 +100,6 @@ class NovaIE(InfoExtractor): 'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53', 'thumbnail': r're:^https?://.*\.(?:jpg)', } - }, { - 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove', - 'info_dict': { - 'id': '1756825', - 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove', - 'ext': 'flv', - 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově', - 'description': 'md5:dc24e50be5908df83348e50d1431295e', # Make sure this description is clean of html tags - 'thumbnail': r're:^https?://.*\.(?:jpg)', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/', - 'info_dict': { - 'id': '1756858', - 'ext': 'flv', - 'title': 'Televizní noviny - 30. 5. 2015', - 'thumbnail': r're:^https?://.*\.(?:jpg)', - 'upload_date': '20150530', - }, - 'params': { - # rtmp download - 'skip_download': True, - } }, { 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', 'info_dict': { @@ -81,6 +114,20 @@ class NovaIE(InfoExtractor): # rtmp download 'skip_download': True, } + }, { + # media.cms.nova.cz embed + 'url': 'https://novaplus.nova.cz/porad/ulice/epizoda/18760-2180-dil', + 'info_dict': { + 'id': '8o0n0r', + 'ext': 'mp4', + 'title': '2180. díl', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2578, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [NovaEmbedIE.ie_key()], }, { 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', 'only_matching': True, @@ -105,6 +152,15 @@ class NovaIE(InfoExtractor): webpage = self._download_webpage(url, display_id) + # novaplus + embed_id = self._search_regex( + r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)', + webpage, 'embed url', default=None) + if embed_id: + return self.url_result( + 'https://media.cms.nova.cz/embed/%s' % embed_id, + ie=NovaEmbedIE.ie_key(), video_id=embed_id) + video_id = self._search_regex( [r"(?:media|video_id)\s*:\s*'(\d+)'", r'media=(\d+)', From 135e6a1c107f49e7968013a57c33eea615406b96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 24 Aug 2018 02:36:56 +0700 Subject: [PATCH 397/488] [vidzi] Add support for vidzi.nu (closes #17316) --- youtube_dl/extractor/vidzi.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index d70283479..42ea4952c 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -13,7 +13,7 @@ from ..utils import ( class VidziIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' _TESTS = [{ 'url': 'http://vidzi.tv/cghql9yq6emu.html', 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', @@ -35,6 +35,9 @@ class VidziIE(InfoExtractor): }, { 'url': 'https://vidzi.si/rph9gztxj1et.html', 'only_matching': True, + }, { + 'url': 'http://vidzi.nu/cghql9yq6emu.html', + 'only_matching': True, }] def _real_extract(self, url): From beff09505cc4a7a980901c6e48adc1620aa417aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 24 Aug 2018 04:00:35 +0700 Subject: [PATCH 398/488] [xfileshare] Add support for vidto.se (closes #17317) --- youtube_dl/extractor/xfileshare.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index bc3239f68..b38c7a7b3 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -23,7 +23,7 @@ class XFileShareIE(InfoExtractor): (r'powerwatch\.pw', 'PowerWatch'), (r'rapidvideo\.ws', 'Rapidvideo.ws'), (r'thevideobee\.to', 'TheVideoBee'), - (r'vidto\.me', 'Vidto'), + (r'vidto\.(?:me|se)', 'Vidto'), (r'streamin\.to', 'Streamin.To'), (r'xvidstage\.com', 'XVIDSTAGE'), (r'vidabc\.com', 'Vid ABC'), @@ -115,7 +115,10 @@ class XFileShareIE(InfoExtractor): 'only_matching': True, }, { 'url': 'http://www.fastvideo.me/k8604r8nk8sn/FAST_FURIOUS_8_-_Trailer_italiano_ufficiale.mp4.html', - 'only_matching': True + 'only_matching': True, + }, { + 'url': 'http://vidto.se/1tx1pf6t12cg.html', + 'only_matching': True, }] @staticmethod From e0b6e988710037791057cd0b65f795d44e2f534c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 24 Aug 2018 23:12:53 +0700 Subject: [PATCH 399/488] [generic] Allow relative src for videojs embeds (closes #17324) --- youtube_dl/extractor/generic.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 83a31f3d3..096348513 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2071,6 +2071,21 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 6, }, + { + # videojs embed + 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904', + 'info_dict': { + 'id': 'shell', + 'ext': 'mp4', + 'title': 'Доставщик пиццы спросил разрешения сыграть на фортепиано', + 'description': 'md5:89209cdc587dab1e4a090453dbaa2cb1', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download MPD manifest'], + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -3152,8 +3167,8 @@ class GenericIE(InfoExtractor): sources = [sources] formats = [] for source in sources: - src = url_or_none(source.get('src')) - if not src: + src = source.get('src') + if not src or not isinstance(src, compat_str): continue src = compat_urlparse.urljoin(url, src) src_type = source.get('type') From dac6f7654a577533ebcb825f34f0d7735274b205 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Aug 2018 20:07:52 +0700 Subject: [PATCH 400/488] [tvplayhome] Add extractor (closes #17344) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tvplay.py | 114 ++++++++++++++++++++++++++++- 2 files changed, 112 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a8b89bcde..6012d0f5f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1181,6 +1181,7 @@ from .tvp import ( from .tvplay import ( TVPlayIE, ViafreeIE, + TVPlayHomeIE, ) from .tvplayer import TVPlayerIE from .tweakers import TweakersIE diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index d3adab457..8f1ff3b76 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -32,12 +32,12 @@ class TVPlayIE(InfoExtractor): https?:// (?:www\.)? (?: - tvplay(?:\.skaties)?\.lv/parraides| - (?:tv3play|play\.tv3)\.lt/programos| + tvplay(?:\.skaties)?\.lv(?:/parraides)?| + (?:tv3play|play\.tv3)\.lt(?:/programos)?| tv3play(?:\.tv3)?\.ee/sisu| (?:tv(?:3|6|8|10)play|viafree)\.se/program| (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer| - play\.novatv\.bg/programi + play\.nova(?:tv)?\.bg/programi ) /(?:[^/]+/)+ ) @@ -203,10 +203,18 @@ class TVPlayIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'https://play.nova.bg/programi/zdravei-bulgariya/764300?autostart=true', + 'only_matching': True, + }, { 'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true', 'only_matching': True, }, + { + 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/418113/?autostart=true', + 'only_matching': True, + }, { # views is null 'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183', @@ -288,6 +296,7 @@ class TVPlayIE(InfoExtractor): 'url': m.group('url'), 'app': m.group('app'), 'play_path': m.group('playpath'), + 'preference': -1, }) else: fmt.update({ @@ -447,3 +456,102 @@ class ViafreeIE(InfoExtractor): 'skip_rtmp': True, }), ie=TVPlayIE.ie_key(), video_id=video_id) + + +class TVPlayHomeIE(InfoExtractor): + _VALID_URL = r'https?://tvplay\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/[^/]+/[^/?#&]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/', + 'info_dict': { + 'id': '366367', + 'ext': 'mp4', + 'title': 'Aferistai', + 'description': 'Aferistai. Kalėdinė pasaka.', + 'series': 'Aferistai [N-7]', + 'season': '1 sezonas', + 'season_number': 1, + 'duration': 464, + 'timestamp': 1394209658, + 'upload_date': '20140307', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TVPlayIE.ie_key()], + }, { + 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/', + 'only_matching': True, + }, { + 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'data-asset-id\s*=\s*["\'](\d{5,7})\b', webpage, 'video id', + default=None) + + if video_id: + return self.url_result( + 'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id) + + m3u8_url = self._search_regex( + r'data-file\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'm3u8 url', group='url') + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + title = self._search_regex( + r'data-title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'title', default=None, group='value') or self._html_search_meta( + 'title', webpage, default=None) or self._og_search_title( + webpage) + + description = self._html_search_meta( + 'description', webpage, + default=None) or self._og_search_description(webpage) + + thumbnail = self._search_regex( + r'data-image\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'thumbnail', default=None, group='url') or self._html_search_meta( + 'thumbnail', webpage, default=None) or self._og_search_thumbnail( + webpage) + + duration = int_or_none(self._search_regex( + r'data-duration\s*=\s*["\'](\d+)', webpage, 'duration', + fatal=False)) + + season = self._search_regex( + (r'data-series-title\s*=\s*(["\'])[^/]+/(?P<value>(?:(?!\1).)+)\1', + r'\bseason\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'season', default=None, group='value') + season_number = int_or_none(self._search_regex( + r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number', + default=None)) + episode = self._search_regex( + r'(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'episode', + default=None, group='value') + episode_number = int_or_none(self._search_regex( + r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number', + default=None)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'formats': formats, + } From 287cf7e443814b405caac87449ee34fb055cc6b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Aug 2018 20:08:31 +0700 Subject: [PATCH 401/488] [generic] Remove unused import --- youtube_dl/extractor/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 096348513..229dfda1b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -32,7 +32,6 @@ from ..utils import ( unified_strdate, unsmuggle_url, UnsupportedError, - url_or_none, xpath_text, ) from .commonprotocols import RtmpIE From dd88fd65a5eda12694104afa3fa891c640de71c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Aug 2018 21:41:55 +0700 Subject: [PATCH 402/488] [webofstories:playlist] Fix extraction (closes #16914) --- youtube_dl/extractor/webofstories.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py index 1eb1f6702..f2b8d19b4 100644 --- a/youtube_dl/extractor/webofstories.py +++ b/youtube_dl/extractor/webofstories.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + orderedSet, +) class WebOfStoriesIE(InfoExtractor): @@ -133,8 +136,10 @@ class WebOfStoriesPlaylistIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) entries = [ - self.url_result('http://www.webofstories.com/play/%s' % video_number, 'WebOfStories') - for video_number in set(re.findall(r'href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage)) + self.url_result( + 'http://www.webofstories.com/play/%s' % video_id, + 'WebOfStories', video_id=video_id) + for video_id in orderedSet(re.findall(r'\bid=["\']td_(\d+)', webpage)) ] title = self._search_regex( From 02df41354cfae04555b3f78b2fcca3e66ec7faba Mon Sep 17 00:00:00 2001 From: Andrew Udvare <Tatsh@users.noreply.github.com> Date: Mon, 27 Aug 2018 11:04:56 -0400 Subject: [PATCH 403/488] [bitchute] Fix extraction by pass custom User-Agent --- youtube_dl/extractor/bitchute.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py index da263714a..446a1ab19 100644 --- a/youtube_dl/extractor/bitchute.py +++ b/youtube_dl/extractor/bitchute.py @@ -33,7 +33,9 @@ class BitChuteIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'https://www.bitchute.com/video/%s' % video_id, video_id) + 'https://www.bitchute.com/video/%s' % video_id, video_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', + }) title = self._search_regex( (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'), From 409b9324da3ead199b26081b228b042b0b494f8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 28 Aug 2018 02:14:47 +0700 Subject: [PATCH 404/488] [youtube:playlist] Add support for music album playlists (OLAK5uy_ prefix) (closes #17361) --- youtube_dl/extractor/youtube.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 117a57911..0442906df 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -64,7 +64,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}' + _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}' def _set_language(self): self._set_cookie( @@ -2123,7 +2123,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) @@ -2261,6 +2261,10 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, { 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', 'only_matching': True, + }, { + # music album playlist + 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', + 'only_matching': True, }] def _real_initialize(self): From 56213aff1d8bd8f06a940fcbd65a96a3baad9d1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 28 Aug 2018 03:07:18 +0700 Subject: [PATCH 405/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ChangeLog b/ChangeLog index c34e9682a..71174bbe6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +version <unreleased> + +Extractors ++ [youtube:playlist] Add support for music album playlists (OLAK5uy_ prefix) + (#17361) +* [bitchute] Fix extraction by pass custom User-Agent (#17360) +* [webofstories:playlist] Fix extraction (#16914) ++ [tvplayhome] Add support for new tvplay URLs (#17344) ++ [generic] Allow relative src for videojs embeds (#17324) ++ [xfileshare] Add support for vidto.se (#17317) ++ [vidzi] Add support for vidzi.nu (#17316) ++ [nova:embed] Add support for media.cms.nova.cz (#17282) + + version 2018.08.22 Core From eebbce5656fd7d626594aa54d2d1ec545d9ae53a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 28 Aug 2018 03:10:09 +0700 Subject: [PATCH 406/488] release 2018.08.28 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 6d54efbc3..0816c4f5f 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.08.22*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.08.22** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.08.28*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.08.28** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.08.22 +[debug] youtube-dl version 2018.08.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 71174bbe6..49f44a6e6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.08.28 Extractors + [youtube:playlist] Add support for music album playlists (OLAK5uy_ prefix) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 5d2a355ff..5beb9bc17 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -580,6 +580,7 @@ - **Normalboots** - **NosVideo** - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz + - **NovaEmbed** - **nowness** - **nowness:playlist** - **nowness:series** @@ -916,6 +917,7 @@ - **tvp:embed**: Telewizja Polska - **tvp:series** - **TVPlayer** + - **TVPlayHome** - **Tweakers** - **twitch:chapter** - **twitch:clips** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f68848389..3e3fe1375 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.08.22' +__version__ = '2018.08.28' From 8959018a5f3efc02f7402c44f4d3950c0811b97a Mon Sep 17 00:00:00 2001 From: Andrew Udvare <audvare@gmail.com> Date: Fri, 16 Mar 2018 20:11:47 -0400 Subject: [PATCH 407/488] [utils] Skip remote IP addresses non matching to source address' IP version (closes #13422) --- youtube_dl/utils.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0c830ba71..2be8c95cd 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -882,7 +882,40 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): kwargs['strict'] = True hc = http_class(*args, **compat_kwargs(kwargs)) source_address = ydl_handler._params.get('source_address') + if source_address is not None: + filter_for = socket.AF_INET if '.' in source_address else socket.AF_INET6 + # This is to workaround _create_connection() from socket where it will try all + # address data from getaddrinfo() including IPv6. This filters the result from + # getaddrinfo() based on the source_address value. + # This is based on the cpython socket.create_connection() function. + # https://github.com/python/cpython/blob/master/Lib/socket.py#L691 + def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): + host, port = address + err = None + addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) + ip_addrs = [addr for addr in addrs if addr[0] == filter_for] + for res in ip_addrs: + af, socktype, proto, canonname, sa = res + sock = None + try: + sock = socket.socket(af, socktype, proto) + if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: + sock.settimeout(timeout) + sock.bind(source_address) + sock.connect(sa) + err = None # Explicitly break reference cycle + return sock + except socket.error as _: + err = _ + if sock is not None: + sock.close() + if err is not None: + raise err + else: + raise socket.error('Unknown error occurred') + hc._create_connection = _create_connection + sa = (source_address, 0) if hasattr(hc, 'source_address'): # Python 2.7+ hc.source_address = sa From 9e21e6d96bed929ba57b8ce775e2a0e29e54dd60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 29 Aug 2018 01:17:21 +0700 Subject: [PATCH 408/488] [utils] Improve remote address skipping and add support for python 2.6 (closes #17362) --- youtube_dl/utils.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2be8c95cd..bcfb72d43 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -49,7 +49,6 @@ from .compat import ( compat_os_name, compat_parse_qs, compat_shlex_quote, - compat_socket_create_connection, compat_str, compat_struct_pack, compat_struct_unpack, @@ -884,7 +883,6 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): source_address = ydl_handler._params.get('source_address') if source_address is not None: - filter_for = socket.AF_INET if '.' in source_address else socket.AF_INET6 # This is to workaround _create_connection() from socket where it will try all # address data from getaddrinfo() including IPv6. This filters the result from # getaddrinfo() based on the source_address value. @@ -894,7 +892,13 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): host, port = address err = None addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) - ip_addrs = [addr for addr in addrs if addr[0] == filter_for] + af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6 + ip_addrs = [addr for addr in addrs if addr[0] == af] + if addrs and not ip_addrs: + ip_version = 'v4' if af == socket.AF_INET else 'v6' + raise socket.error( + "No remote IP%s addresses available for connect, can't use '%s' as source address" + % (ip_version, source_address[0])) for res in ip_addrs: af, socktype, proto, canonname, sa = res sock = None @@ -913,15 +917,15 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): if err is not None: raise err else: - raise socket.error('Unknown error occurred') - hc._create_connection = _create_connection - + raise socket.error('getaddrinfo returns an empty list') + if hasattr(hc, '_create_connection'): + hc._create_connection = _create_connection sa = (source_address, 0) if hasattr(hc, 'source_address'): # Python 2.7+ hc.source_address = sa else: # Python 2.6 def _hc_connect(self, *args, **kwargs): - sock = compat_socket_create_connection( + sock = _create_connection( (self.host, self.port), self.timeout, sa) if is_https: self.sock = ssl.wrap_socket( From 73f3bdbeb4b62a1286bcd74fdd1d5741740ea845 Mon Sep 17 00:00:00 2001 From: Leonardo Taccari <iamleot@gmail.com> Date: Thu, 30 Aug 2018 21:15:46 +0200 Subject: [PATCH 409/488] [internazionale] Fix extraction of non-available-abroad videos --- youtube_dl/extractor/internazionale.py | 27 +++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/internazionale.py b/youtube_dl/extractor/internazionale.py index 10ba1f6cf..676e8e269 100644 --- a/youtube_dl/extractor/internazionale.py +++ b/youtube_dl/extractor/internazionale.py @@ -7,7 +7,7 @@ from ..utils import unified_timestamp class InternazionaleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?internazionale\.it/video/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.internazionale.it/video/2015/02/19/richard-linklater-racconta-una-scena-di-boyhood', 'md5': '3e39d32b66882c1218e305acbf8348ca', 'info_dict': { @@ -23,7 +23,23 @@ class InternazionaleIE(InfoExtractor): 'params': { 'format': 'bestvideo', }, - } + }, { + 'url': 'https://www.internazionale.it/video/2018/08/29/telefono-stare-con-noi-stessi', + 'md5': '9db8663704cab73eb972d1cee0082c79', + 'info_dict': { + 'id': '761344', + 'display_id': 'telefono-stare-con-noi-stessi', + 'ext': 'mp4', + 'title': 'Usiamo il telefono per evitare di stare con noi stessi', + 'description': 'md5:75ccfb0d6bcefc6e7428c68b4aa1fe44', + 'timestamp': 1535528954, + 'upload_date': '20180829', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'format': 'bestvideo', + }, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -40,8 +56,13 @@ class InternazionaleIE(InfoExtractor): DATA_RE % 'job-id', webpage, 'video id', group='value') video_path = self._search_regex( DATA_RE % 'video-path', webpage, 'video path', group='value') + video_available_abroad = self._search_regex( + DATA_RE % 'video-available_abroad', webpage, + 'video available aboard', default='1', group='value') + video_available_abroad = video_available_abroad == '1' - video_base = 'https://video.internazionale.it/%s/%s.' % (video_path, video_id) + video_base = 'https://video%s.internazionale.it/%s/%s.' % \ + ('' if video_available_abroad else '-ita', video_path, video_id) formats = self._extract_m3u8_formats( video_base + 'm3u8', display_id, 'mp4', From 14b7a24c19d70ec172eaedf799bcf81445ac9403 Mon Sep 17 00:00:00 2001 From: Parmjit Virk <pvirk@mts.net> Date: Thu, 30 Aug 2018 14:32:35 -0500 Subject: [PATCH 410/488] [bandcamp] Extract track_number (closes #17266) --- youtube_dl/extractor/bandcamp.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index b8514734d..bc4c5165a 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -44,6 +44,17 @@ class BandcampIE(InfoExtractor): 'title': 'Ben Prunty - Lanius (Battle)', 'uploader': 'Ben Prunty', }, + }, { + 'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire', + 'info_dict': { + 'id': '2584466013', + 'ext': 'mp3', + 'title': 'Hail to Fire', + 'track_number': 5, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -82,6 +93,7 @@ class BandcampIE(InfoExtractor): 'thumbnail': thumbnail, 'formats': formats, 'duration': float_or_none(data.get('duration')), + 'track_number': int_or_none(data.get('track_num')), } else: raise ExtractorError('No free songs found') From 4991e16c2a62ecd1a81cbb9ff6563885c1bf6210 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 31 Aug 2018 03:35:55 +0700 Subject: [PATCH 411/488] [bandcamp] Extract more metadata (closes #13197) --- youtube_dl/extractor/bandcamp.py | 218 +++++++++++++++++++------------ 1 file changed, 134 insertions(+), 84 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index bc4c5165a..f14b407dc 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import json import random import re import time @@ -16,15 +15,18 @@ from ..utils import ( int_or_none, KNOWN_EXTENSIONS, parse_filesize, + str_or_none, + try_get, unescapeHTML, update_url_query, unified_strdate, + unified_timestamp, url_or_none, ) class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)' + _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', @@ -36,24 +38,44 @@ class BandcampIE(InfoExtractor): }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { + # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '0369ace6b939f0927e62c67a1a8d9fa7', + 'md5': '853e35bf34aa1d6fe2615ae612564b36', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', 'title': 'Ben Prunty - Lanius (Battle)', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Ben Prunty', + 'timestamp': 1396508491, + 'upload_date': '20140403', + 'release_date': '20140403', + 'duration': 260.877, + 'track': 'Lanius (Battle)', + 'track_number': 1, + 'track_id': '2650410135', + 'artist': 'Ben Prunty', + 'album': 'FTL: Advanced Edition Soundtrack', }, }, { + # no free download, mp3 128 'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire', + 'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7', 'info_dict': { 'id': '2584466013', 'ext': 'mp3', - 'title': 'Hail to Fire', + 'title': 'Mastodon - Hail to Fire', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Mastodon', + 'timestamp': 1322005399, + 'upload_date': '20111122', + 'release_date': '20040207', + 'duration': 120.79, + 'track': 'Hail to Fire', 'track_number': 5, - }, - 'params': { - 'skip_download': True, + 'track_id': '2584466013', + 'artist': 'Mastodon', + 'album': 'Call of the Mastodon', }, }] @@ -62,19 +84,23 @@ class BandcampIE(InfoExtractor): title = mobj.group('title') webpage = self._download_webpage(url, title) thumbnail = self._html_search_meta('og:image', webpage, default=None) - m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) - if not m_download: - m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) - if m_trackinfo: - json_code = m_trackinfo.group(1) - data = json.loads(json_code)[0] - track_id = compat_str(data['id']) - if not data.get('file'): - raise ExtractorError('Not streamable', video_id=track_id, expected=True) + track_id = None + track = None + track_number = None + duration = None - formats = [] - for format_id, format_url in data['file'].items(): + formats = [] + track_info = self._parse_json( + self._search_regex( + r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', + webpage, 'track info', default='{}'), title) + if track_info: + file_ = track_info.get('file') + if isinstance(file_, dict): + for format_id, format_url in file_.items(): + if not url_or_none(format_url): + continue ext, abr_str = format_id.split('-', 1) formats.append({ 'format_id': format_id, @@ -84,86 +110,110 @@ class BandcampIE(InfoExtractor): 'acodec': ext, 'abr': int_or_none(abr_str), }) + track = track_info.get('title') + track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) + track_number = int_or_none(track_info.get('track_num')) + duration = float_or_none(track_info.get('duration')) - self._sort_formats(formats) + def extract(key): + return self._search_regex( + r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key, + webpage, key, default=None, group='value') - return { - 'id': track_id, - 'title': data['title'], - 'thumbnail': thumbnail, - 'formats': formats, - 'duration': float_or_none(data.get('duration')), - 'track_number': int_or_none(data.get('track_num')), - } - else: - raise ExtractorError('No free songs found') + artist = extract('artist') + album = extract('album_title') + timestamp = unified_timestamp( + extract('publish_date') or extract('album_publish_date')) + release_date = unified_strdate(extract('album_release_date')) - download_link = m_download.group(1) - video_id = self._search_regex( - r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', - webpage, 'video id') + download_link = self._search_regex( + r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'download link', default=None, group='url') + if download_link: + track_id = self._search_regex( + r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', + webpage, 'track id') - download_webpage = self._download_webpage( - download_link, video_id, 'Downloading free downloads page') + download_webpage = self._download_webpage( + download_link, track_id, 'Downloading free downloads page') - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, - 'blob', group='blob'), - video_id, transform_source=unescapeHTML) + blob = self._parse_json( + self._search_regex( + r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, + 'blob', group='blob'), + track_id, transform_source=unescapeHTML) - info = blob['digital_items'][0] + info = try_get( + blob, (lambda x: x['digital_items'][0], + lambda x: x['download_items'][0]), dict) + if info: + downloads = info.get('downloads') + if isinstance(downloads, dict): + if not track: + track = info.get('title') + if not artist: + artist = info.get('artist') + if not thumbnail: + thumbnail = info.get('thumb_url') - downloads = info['downloads'] - track = info['title'] + download_formats = {} + download_formats_list = blob.get('download_formats') + if isinstance(download_formats_list, list): + for f in blob['download_formats']: + name, ext = f.get('name'), f.get('file_extension') + if all(isinstance(x, compat_str) for x in (name, ext)): + download_formats[name] = ext.strip('.') - artist = info.get('artist') - title = '%s - %s' % (artist, track) if artist else track + for format_id, f in downloads.items(): + format_url = f.get('url') + if not format_url: + continue + # Stat URL generation algorithm is reverse engineered from + # download_*_bundle_*.js + stat_url = update_url_query( + format_url.replace('/download/', '/statdownload/'), { + '.rand': int(time.time() * 1000 * random.random()), + }) + format_id = f.get('encoding_name') or format_id + stat = self._download_json( + stat_url, track_id, 'Downloading %s JSON' % format_id, + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1], + fatal=False) + if not stat: + continue + retry_url = url_or_none(stat.get('retry_url')) + if not retry_url: + continue + formats.append({ + 'url': self._proto_relative_url(retry_url, 'http:'), + 'ext': download_formats.get(format_id), + 'format_id': format_id, + 'format_note': f.get('description'), + 'filesize': parse_filesize(f.get('size_mb')), + 'vcodec': 'none', + }) - download_formats = {} - for f in blob['download_formats']: - name, ext = f.get('name'), f.get('file_extension') - if all(isinstance(x, compat_str) for x in (name, ext)): - download_formats[name] = ext.strip('.') - - formats = [] - for format_id, f in downloads.items(): - format_url = f.get('url') - if not format_url: - continue - # Stat URL generation algorithm is reverse engineered from - # download_*_bundle_*.js - stat_url = update_url_query( - format_url.replace('/download/', '/statdownload/'), { - '.rand': int(time.time() * 1000 * random.random()), - }) - format_id = f.get('encoding_name') or format_id - stat = self._download_json( - stat_url, video_id, 'Downloading %s JSON' % format_id, - transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1], - fatal=False) - if not stat: - continue - retry_url = url_or_none(stat.get('retry_url')) - if not retry_url: - continue - formats.append({ - 'url': self._proto_relative_url(retry_url, 'http:'), - 'ext': download_formats.get(format_id), - 'format_id': format_id, - 'format_note': f.get('description'), - 'filesize': parse_filesize(f.get('size_mb')), - 'vcodec': 'none', - }) self._sort_formats(formats) + title = '%s - %s' % (artist, track) if artist else track + + if not duration: + duration = float_or_none(self._html_search_meta( + 'duration', webpage, default=None)) + return { - 'id': video_id, + 'id': track_id, 'title': title, - 'thumbnail': info.get('thumb_url') or thumbnail, - 'uploader': info.get('artist'), - 'artist': artist, + 'thumbnail': thumbnail, + 'uploader': artist, + 'timestamp': timestamp, + 'release_date': release_date, + 'duration': duration, 'track': track, + 'track_number': track_number, + 'track_id': track_id, + 'artist': artist, + 'album': album, 'formats': formats, } From c1a37eb24a78fe07a3ad41fc151389e444b7962b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 1 Sep 2018 00:18:17 +0200 Subject: [PATCH 412/488] [ard] Add support for Beta ARD Mediathek Thanks to https://blog.fefe.de/?ts=a577685d for pointing out support is missing. --- youtube_dl/extractor/ard.py | 62 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 63 insertions(+) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 23f574d36..9c6be2dd9 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -282,3 +282,65 @@ class ARDIE(InfoExtractor): 'upload_date': upload_date, 'thumbnail': thumbnail, } + + +class ARDBetaMediathekIE(InfoExtractor): + _VALID_URL = r'https://beta\.ardmediathek\.de/[a-z]+/player/(?P<video_id>[a-zA-Z0-9]+)/(?P<display_id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', + 'md5': '2d02d996156ea3c397cfc5036b5d7f8f', + 'info_dict': { + 'display_id': 'die-robuste-roswita', + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', + 'title': 'Tatort: Die robuste Roswita', + 'description': r're:^Der Mord.*trüber ist als die Ilm.', + 'duration': 5316, + 'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard', + 'upload_date': '20180826', + 'ext': 'mp4', + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);', webpage, 'json') + data = self._parse_json(data_json, display_id) + + res = { + 'id': video_id, + 'display_id': display_id, + } + formats = [] + for widget in data.values(): + if widget.get('_geoblocked'): + raise ExtractorError('This video is not available due to geoblocking', expected=True) + + if '_duration' in widget: + res['duration'] = widget['_duration'] + if 'clipTitle' in widget: + res['title'] = widget['clipTitle'] + if '_previewImage' in widget: + res['thumbnail'] = widget['_previewImage'] + if 'broadcastedOn' in widget: + res['upload_date'] = unified_strdate(widget['broadcastedOn']) + if 'synopsis' in widget: + res['description'] = widget['synopsis'] + if '_subtitleUrl' in widget: + res['subtitles'] = {'de': [{ + 'ext': 'ttml', + 'url': widget['_subtitleUrl'], + }]} + if '_quality' in widget: + formats.append({ + 'format_id': widget['_quality'], + 'url': widget['_stream']['json'][0], + }) + + self._sort_formats(formats) + res['formats'] = formats + + return res diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6012d0f5f..995af9988 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -54,6 +54,7 @@ from .appletrailers import ( from .archiveorg import ArchiveOrgIE from .arkena import ArkenaIE from .ard import ( + ARDBetaMediathekIE, ARDIE, ARDMediathekIE, ) From 2b83da246375e6b37884aad9fe22ea8e461e20b3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 1 Sep 2018 00:45:36 +0200 Subject: [PATCH 413/488] [ard] Better format handling Skip f4m, doesn't work (yet); correctly extract m3u8, and prefer plain HTTP files. --- youtube_dl/extractor/ard.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 9c6be2dd9..cff8ca4a5 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -335,10 +335,24 @@ class ARDBetaMediathekIE(InfoExtractor): 'url': widget['_subtitleUrl'], }]} if '_quality' in widget: - formats.append({ - 'format_id': widget['_quality'], - 'url': widget['_stream']['json'][0], - }) + format_url = widget['_stream']['json'][0] + + if format_url.endswith('.f4m'): + # Skip f4m - these URLs just return a 403 + formats.append({ + 'format_id': 'f4m-' + widget['_quality'], + 'url': format_url, + 'preference': -1001, + }) + elif format_url.endswith('m3u8'): + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'format_id': 'http-' + widget['_quality'], + 'url': format_url, + 'preference': 10, # Plain HTTP, that's nice + }) self._sort_formats(formats) res['formats'] = formats From ed6919e7371b3da66c10e4c5768816e81f4c5db3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 1 Sep 2018 01:59:13 +0200 Subject: [PATCH 414/488] [ard] beta mediathek: make regexp for JSON more robust --- youtube_dl/extractor/ard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index cff8ca4a5..dcb347849 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -307,7 +307,7 @@ class ARDBetaMediathekIE(InfoExtractor): display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);', webpage, 'json') + data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') data = self._parse_json(data_json, display_id) res = { From 54a5be4dba3560ff51f98865b5598d361a878e82 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 1 Sep 2018 08:16:28 +0100 Subject: [PATCH 415/488] [crunchyroll] parse vilos media data(closes #17343) --- youtube_dl/extractor/crunchyroll.py | 205 ++++++++++++++++------------ youtube_dl/extractor/vrv.py | 48 ++++--- 2 files changed, 141 insertions(+), 112 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 463f995c7..4ed458372 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -7,7 +7,7 @@ import zlib from hashlib import sha1 from math import pow, sqrt, floor -from .common import InfoExtractor +from .vrv import VRVIE from ..compat import ( compat_b64decode, compat_etree_fromstring, @@ -18,6 +18,8 @@ from ..compat import ( from ..utils import ( ExtractorError, bytes_to_intlist, + extract_attributes, + float_or_none, intlist_to_bytes, int_or_none, lowercase_escape, @@ -26,14 +28,13 @@ from ..utils import ( unified_strdate, urlencode_postdata, xpath_text, - extract_attributes, ) from ..aes import ( aes_cbc_decrypt, ) -class CrunchyrollBaseIE(InfoExtractor): +class CrunchyrollBaseIE(VRVIE): _LOGIN_URL = 'https://www.crunchyroll.com/login' _LOGIN_FORM = 'login_form' _NETRC_MACHINE = 'crunchyroll' @@ -148,7 +149,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'ext': 'mp4', 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', 'description': 'md5:2d17137920c64f2f49981a7797d275ef', - 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Yomiuri Telecasting Corporation (YTV)', 'upload_date': '20131013', 'url': 're:(?!.*&)', @@ -221,7 +222,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'info_dict': { 'id': '535080', 'ext': 'mp4', - 'title': '11eyes Episode 1 – Piros éjszaka - Red Night', + 'title': '11eyes Episode 1 – Red Night ~ Piros éjszaka', 'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".', 'uploader': 'Marvelous AQL Inc.', 'upload_date': '20091021', @@ -437,13 +438,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if 'To view this, please log in to verify you are 18 or older.' in webpage: self.raise_login_required() + media = self._parse_json(self._search_regex( + r'vilos\.config\.media\s*=\s*({.+?});', + webpage, 'vilos media', default='{}'), video_id) + media_metadata = media.get('metadata') or {} + video_title = self._html_search_regex( r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) - video_description = self._parse_json(self._html_search_regex( + video_description = (self._parse_json(self._html_search_regex( r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, - webpage, 'description', default='{}'), video_id).get('description') + webpage, 'description', default='{}'), video_id) or media_metadata).get('description') if video_description: video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_upload_date = self._html_search_regex( @@ -456,91 +462,99 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], webpage, 'video_uploader', fatal=False) - available_fmts = [] - for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): - attrs = extract_attributes(a) - href = attrs.get('href') - if href and '/freetrial' in href: - continue - available_fmts.append(fmt) - if not available_fmts: - for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): - available_fmts = re.findall(p, webpage) - if available_fmts: - break - video_encode_ids = [] formats = [] - for fmt in available_fmts: - stream_quality, stream_format = self._FORMAT_IDS[fmt] - video_format = fmt + 'p' - stream_infos = [] - streamdata = self._call_rpc_api( - 'VideoPlayer_GetStandardConfig', video_id, - 'Downloading media info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_quality': stream_quality, - 'current_page': url, - }) - if streamdata is not None: - stream_info = streamdata.find('./{default}preload/stream_info') + for stream in media.get('streams', []): + formats.extend(self._extract_vrv_formats( + stream.get('url'), video_id, stream.get('format'), + stream.get('audio_lang'), stream.get('hardsub_lang'))) + if not formats: + available_fmts = [] + for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): + attrs = extract_attributes(a) + href = attrs.get('href') + if href and '/freetrial' in href: + continue + available_fmts.append(fmt) + if not available_fmts: + for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): + available_fmts = re.findall(p, webpage) + if available_fmts: + break + if not available_fmts: + available_fmts = self._FORMAT_IDS.keys() + video_encode_ids = [] + + for fmt in available_fmts: + stream_quality, stream_format = self._FORMAT_IDS[fmt] + video_format = fmt + 'p' + stream_infos = [] + streamdata = self._call_rpc_api( + 'VideoPlayer_GetStandardConfig', video_id, + 'Downloading media info for %s' % video_format, data={ + 'media_id': video_id, + 'video_format': stream_format, + 'video_quality': stream_quality, + 'current_page': url, + }) + if streamdata is not None: + stream_info = streamdata.find('./{default}preload/stream_info') + if stream_info is not None: + stream_infos.append(stream_info) + stream_info = self._call_rpc_api( + 'VideoEncode_GetStreamInfo', video_id, + 'Downloading stream info for %s' % video_format, data={ + 'media_id': video_id, + 'video_format': stream_format, + 'video_encode_quality': stream_quality, + }) if stream_info is not None: stream_infos.append(stream_info) - stream_info = self._call_rpc_api( - 'VideoEncode_GetStreamInfo', video_id, - 'Downloading stream info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_encode_quality': stream_quality, - }) - if stream_info is not None: - stream_infos.append(stream_info) - for stream_info in stream_infos: - video_encode_id = xpath_text(stream_info, './video_encode_id') - if video_encode_id in video_encode_ids: - continue - video_encode_ids.append(video_encode_id) + for stream_info in stream_infos: + video_encode_id = xpath_text(stream_info, './video_encode_id') + if video_encode_id in video_encode_ids: + continue + video_encode_ids.append(video_encode_id) - video_file = xpath_text(stream_info, './file') - if not video_file: - continue - if video_file.startswith('http'): - formats.extend(self._extract_m3u8_formats( - video_file, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - - video_url = xpath_text(stream_info, './host') - if not video_url: - continue - metadata = stream_info.find('./metadata') - format_info = { - 'format': video_format, - 'height': int_or_none(xpath_text(metadata, './height')), - 'width': int_or_none(xpath_text(metadata, './width')), - } - - if '.fplive.net/' in video_url: - video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) - parsed_video_url = compat_urlparse.urlparse(video_url) - direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( - netloc='v.lvlt.crcdn.net', - path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) - if self._is_valid_url(direct_video_url, video_id, video_format): - format_info.update({ - 'format_id': 'http-' + video_format, - 'url': direct_video_url, - }) - formats.append(format_info) + video_file = xpath_text(stream_info, './file') + if not video_file: + continue + if video_file.startswith('http'): + formats.extend(self._extract_m3u8_formats( + video_file, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) continue - format_info.update({ - 'format_id': 'rtmp-' + video_format, - 'url': video_url, - 'play_path': video_file, - 'ext': 'flv', - }) - formats.append(format_info) + video_url = xpath_text(stream_info, './host') + if not video_url: + continue + metadata = stream_info.find('./metadata') + format_info = { + 'format': video_format, + 'height': int_or_none(xpath_text(metadata, './height')), + 'width': int_or_none(xpath_text(metadata, './width')), + } + + if '.fplive.net/' in video_url: + video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) + parsed_video_url = compat_urlparse.urlparse(video_url) + direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( + netloc='v.lvlt.crcdn.net', + path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) + if self._is_valid_url(direct_video_url, video_id, video_format): + format_info.update({ + 'format_id': 'http-' + video_format, + 'url': direct_video_url, + }) + formats.append(format_info) + continue + + format_info.update({ + 'format_id': 'rtmp-' + video_format, + 'url': video_url, + 'play_path': video_file, + 'ext': 'flv', + }) + formats.append(format_info) self._sort_formats(formats, ('height', 'width', 'tbr', 'fps')) metadata = self._call_rpc_api( @@ -549,7 +563,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'media_id': video_id, }) - subtitles = self.extract_subtitles(video_id, webpage) + subtitles = {} + for subtitle in media.get('subtitles', []): + subtitle_url = subtitle.get('url') + if not subtitle_url: + continue + subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({ + 'url': subtitle_url, + 'ext': subtitle.get('format', 'ass'), + }) + if not subtitles: + subtitles = self.extract_subtitles(video_id, webpage) # webpage provide more accurate data than series_title from XML series = self._html_search_regex( @@ -557,8 +581,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage, 'series', fatal=False) season = xpath_text(metadata, 'series_title') - episode = xpath_text(metadata, 'episode_title') - episode_number = int_or_none(xpath_text(metadata, 'episode_number')) + episode = xpath_text(metadata, 'episode_title') or media_metadata.get('title') + episode_number = int_or_none(xpath_text(metadata, 'episode_number') or media_metadata.get('episode_number')) season_number = int_or_none(self._search_regex( r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', @@ -568,7 +592,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'id': video_id, 'title': video_title, 'description': video_description, - 'thumbnail': xpath_text(metadata, 'episode_image_url'), + 'duration': float_or_none(media_metadata.get('duration'), 1000), + 'thumbnail': xpath_text(metadata, 'episode_image_url') or media_metadata.get('thumbnail', {}).get('url'), 'uploader': video_uploader, 'upload_date': video_upload_date, 'series': series, diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 64b13f0ed..921e9e172 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -72,7 +72,7 @@ class VRVBaseIE(InfoExtractor): class VRVIE(VRVBaseIE): IE_NAME = 'vrv' _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', 'info_dict': { 'id': 'GR9PNZ396', @@ -85,7 +85,28 @@ class VRVIE(VRVBaseIE): # m3u8 download 'skip_download': True, }, - } + }] + + def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): + if not url or stream_format not in ('hls', 'dash'): + return [] + stream_id = hardsub_lang or audio_lang + format_id = '%s-%s' % (stream_format, stream_id) + if stream_format == 'hls': + adaptive_formats = self._extract_m3u8_formats( + url, video_id, 'mp4', m3u8_id=format_id, + note='Downloading %s m3u8 information' % stream_id, + fatal=False) + elif stream_format == 'dash': + adaptive_formats = self._extract_mpd_formats( + url, video_id, mpd_id=format_id, + note='Downloading %s MPD information' % stream_id, + fatal=False) + if audio_lang: + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_lang + return adaptive_formats def _real_extract(self, url): video_id = self._match_id(url) @@ -115,26 +136,9 @@ class VRVIE(VRVBaseIE): for stream_type, streams in streams_json.get('streams', {}).items(): if stream_type in ('adaptive_hls', 'adaptive_dash'): for stream in streams.values(): - stream_url = stream.get('url') - if not stream_url: - continue - stream_id = stream.get('hardsub_locale') or audio_locale - format_id = '%s-%s' % (stream_type.split('_')[1], stream_id) - if stream_type == 'adaptive_hls': - adaptive_formats = self._extract_m3u8_formats( - stream_url, video_id, 'mp4', m3u8_id=format_id, - note='Downloading %s m3u8 information' % stream_id, - fatal=False) - else: - adaptive_formats = self._extract_mpd_formats( - stream_url, video_id, mpd_id=format_id, - note='Downloading %s MPD information' % stream_id, - fatal=False) - if audio_locale: - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = audio_locale - formats.extend(adaptive_formats) + formats.extend(self._extract_vrv_formats( + stream.get('url'), video_id, stream_type.split('_')[1], + audio_locale, stream.get('hardsub_locale'))) self._sort_formats(formats) subtitles = {} From 7f2611cb5b086fbdb6281779a96fcb75a594e68c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 1 Sep 2018 08:40:38 +0100 Subject: [PATCH 416/488] [ard] extract f4m formats --- youtube_dl/extractor/ard.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index dcb347849..194a369c0 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -338,12 +338,9 @@ class ARDBetaMediathekIE(InfoExtractor): format_url = widget['_stream']['json'][0] if format_url.endswith('.f4m'): - # Skip f4m - these URLs just return a 403 - formats.append({ - 'format_id': 'f4m-' + widget['_quality'], - 'url': format_url, - 'preference': -1001, - }) + formats.extend(self._extract_f4m_formats( + format_url + '?hdcore=3.11.0', + video_id, f4m_id='hds', fatal=False)) elif format_url.endswith('m3u8'): formats.extend(self._extract_m3u8_formats( format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) From 462799588288b9ac8280dc361a9efcced8cb29e1 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 1 Sep 2018 10:04:10 +0100 Subject: [PATCH 417/488] [crunchyroll] limit VRVIE inheritance to CrunchyrollIE --- youtube_dl/extractor/crunchyroll.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4ed458372..ba8b9fa7e 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -7,6 +7,7 @@ import zlib from hashlib import sha1 from math import pow, sqrt, floor +from .common import InfoExtractor from .vrv import VRVIE from ..compat import ( compat_b64decode, @@ -34,7 +35,7 @@ from ..aes import ( ) -class CrunchyrollBaseIE(VRVIE): +class CrunchyrollBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.crunchyroll.com/login' _LOGIN_FORM = 'login_form' _NETRC_MACHINE = 'crunchyroll' @@ -140,7 +141,8 @@ class CrunchyrollBaseIE(VRVIE): parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) -class CrunchyrollIE(CrunchyrollBaseIE): +class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): + IE_NAME = 'crunchyroll' _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', From 4d59db5b900fa8f1416cb33328f2f12daf9946c5 Mon Sep 17 00:00:00 2001 From: Gorfiend <Gorfiend@users.noreply.github.com> Date: Sat, 1 Sep 2018 05:04:45 -0400 Subject: [PATCH 418/488] [niconico] Fix extraction on python3 (closes #17393) --- youtube_dl/extractor/niconico.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index dbe871f16..76b412ff1 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -252,7 +252,7 @@ class NiconicoIE(InfoExtractor): }, 'timing_constraint': 'unlimited' } - })) + }).encode()) resolution = video_quality.get('resolution', {}) From 0b87e8845328fd93a6f0fd873f5394f63c9ce64f Mon Sep 17 00:00:00 2001 From: LangerJan <LangerJan@users.noreply.github.com> Date: Sat, 1 Sep 2018 11:42:30 +0200 Subject: [PATCH 419/488] [ard] Add support for one.ard.de --- youtube_dl/extractor/ard.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 194a369c0..6bf8f61eb 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -21,7 +21,7 @@ from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' + _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' _TESTS = [{ # available till 26.07.2022 @@ -37,6 +37,9 @@ class ARDMediathekIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', + 'only_matching': True, }, { # audio 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', From 7bbc1b189aa7b0cea5336964a2acdca4001c1566 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 1 Sep 2018 18:36:18 +0700 Subject: [PATCH 420/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ChangeLog b/ChangeLog index 49f44a6e6..7585f06e9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +version <unreleased> + +Core +* [utils] Skip remote IP addresses non matching to source address' IP version + when creating a connection (#13422, #17362) + +Extractors ++ [ard] Add support for one.ard.de (#17397) +* [niconico] Fix extraction on python3 (#17393, #17407) +* [ard] Extract f4m formats +* [crunchyroll] Parse vilos media data (#17343) ++ [ard] Add support for Beta ARD Mediathek ++ [bandcamp] Extract more metadata (#13197) +* [internazionale] Fix extraction of non-available-abroad videos (#17386) + + version 2018.08.28 Extractors From 27d8e089a21561480f5261f77665604bba2f5901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 1 Sep 2018 18:40:23 +0700 Subject: [PATCH 421/488] release 2018.09.01 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 0816c4f5f..16d8f36d7 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.08.28*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.08.28** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.09.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.09.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.08.28 +[debug] youtube-dl version 2018.09.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 7585f06e9..4632133a3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.09.01 Core * [utils] Skip remote IP addresses non matching to source address' IP version diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 5beb9bc17..f0d6be901 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -56,6 +56,7 @@ - **archive.org**: archive.org videos - **ARD** - **ARD:mediathek** + - **ARDBetaMediathek** - **Arkena** - **arte.tv** - **arte.tv:+7** @@ -191,7 +192,7 @@ - **Crackle** - **Criterion** - **CrooksAndLiars** - - **Crunchyroll** + - **crunchyroll** - **crunchyroll:playlist** - **CSNNE** - **CSpan**: C-SPAN diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 3e3fe1375..e655e0050 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.08.28' +__version__ = '2018.09.01' From 3d08f63dc57f000384703f26b7dcb4b683e18c05 Mon Sep 17 00:00:00 2001 From: Mohammed Yaseen Mowzer <yaseen@mowzer.co.za> Date: Thu, 14 Jun 2018 17:12:33 +0200 Subject: [PATCH 422/488] [generic] Skip unsuccessful jwplayer extraction (closes #16735) --- youtube_dl/extractor/generic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 229dfda1b..1db154c4f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -3150,9 +3150,13 @@ class GenericIE(InfoExtractor): jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) if jwplayer_data: - info = self._parse_jwplayer_data( - jwplayer_data, video_id, require_title=False, base_url=url) - return merge_dicts(info, info_dict) + try: + info = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False, base_url=url) + return merge_dicts(info, info_dict) + except ExtractorError: + # See https://github.com/rg3/youtube-dl/pull/16735 + pass # Video.js embed mobj = re.search( From 0a9a8118ce834c206434df04c18187934acbf608 Mon Sep 17 00:00:00 2001 From: Hormoz K <hkheradm@gmail.com> Date: Sat, 4 Aug 2018 09:47:58 -0400 Subject: [PATCH 423/488] [radiojavan] Fix extraction --- youtube_dl/extractor/radiojavan.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/radiojavan.py b/youtube_dl/extractor/radiojavan.py index a53ad97a5..4124bcd45 100644 --- a/youtube_dl/extractor/radiojavan.py +++ b/youtube_dl/extractor/radiojavan.py @@ -6,11 +6,13 @@ from .common import InfoExtractor from ..utils import ( unified_strdate, str_to_int, + urlencode_postdata, ) class RadioJavanIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P<id>[^/]+)/?' + _HOST_TRACKER_URL = 'https://www.radiojavan.com/videos/video_host' _TEST = { 'url': 'http://www.radiojavan.com/videos/video/chaartaar-ashoobam', 'md5': 'e85208ffa3ca8b83534fca9fe19af95b', @@ -31,8 +33,18 @@ class RadioJavanIE(InfoExtractor): webpage = self._download_webpage(url, video_id) + download_host = self._download_json( + self._HOST_TRACKER_URL, + video_id, + data=urlencode_postdata({'id': video_id}), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + } + )['host'] + formats = [{ - 'url': 'https://media.rdjavan.com/media/music_video/%s' % video_path, + 'url': '%s/%s' % (download_host, video_path), 'format_id': '%sp' % height, 'height': int(height), } for height, video_path in re.findall(r"RJ\.video(\d+)p\s*=\s*'/?([^']+)'", webpage)] From 93284ff2ea4eb84c25b8d496012398a056ba89ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Sep 2018 02:53:26 +0700 Subject: [PATCH 424/488] [radiojavan] Improve extraction (closes #17151) --- youtube_dl/extractor/radiojavan.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/radiojavan.py b/youtube_dl/extractor/radiojavan.py index 4124bcd45..3f74f0c01 100644 --- a/youtube_dl/extractor/radiojavan.py +++ b/youtube_dl/extractor/radiojavan.py @@ -4,15 +4,16 @@ import re from .common import InfoExtractor from ..utils import ( - unified_strdate, + parse_resolution, str_to_int, + unified_strdate, urlencode_postdata, + urljoin, ) class RadioJavanIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P<id>[^/]+)/?' - _HOST_TRACKER_URL = 'https://www.radiojavan.com/videos/video_host' _TEST = { 'url': 'http://www.radiojavan.com/videos/video/chaartaar-ashoobam', 'md5': 'e85208ffa3ca8b83534fca9fe19af95b', @@ -31,23 +32,26 @@ class RadioJavanIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - download_host = self._download_json( - self._HOST_TRACKER_URL, - video_id, + 'https://www.radiojavan.com/videos/video_host', video_id, data=urlencode_postdata({'id': video_id}), headers={ 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': url, - } - )['host'] + }).get('host', 'https://host1.rjmusicmedia.com') - formats = [{ - 'url': '%s/%s' % (download_host, video_path), - 'format_id': '%sp' % height, - 'height': int(height), - } for height, video_path in re.findall(r"RJ\.video(\d+)p\s*=\s*'/?([^']+)'", webpage)] + webpage = self._download_webpage(url, video_id) + + formats = [] + for format_id, _, video_path in re.findall( + r'RJ\.video(?P<format_id>\d+[pPkK])\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2', + webpage): + f = parse_resolution(format_id) + f.update({ + 'url': urljoin(download_host, video_path), + 'format_id': format_id, + }) + formats.append(f) self._sort_formats(formats) title = self._og_search_title(webpage) From aa1d5eb9053b4c39661415ef411fb3746ee11f41 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 4 Sep 2018 10:37:11 +0100 Subject: [PATCH 425/488] [slideslive] make the check for video_service_name case-insensitive(closes #17429) --- youtube_dl/extractor/slideslive.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/slideslive.py b/youtube_dl/extractor/slideslive.py index 104576033..ed84322c5 100644 --- a/youtube_dl/extractor/slideslive.py +++ b/youtube_dl/extractor/slideslive.py @@ -8,6 +8,7 @@ from ..utils import ExtractorError class SlidesLiveIE(InfoExtractor): _VALID_URL = r'https?://slideslive\.com/(?P<id>[0-9]+)' _TESTS = [{ + # video_service_name = YOUTUBE 'url': 'https://slideslive.com/38902413/gcc-ia16-backend', 'md5': 'b29fcd6c6952d0c79c5079b0e7a07e6f', 'info_dict': { @@ -19,14 +20,18 @@ class SlidesLiveIE(InfoExtractor): 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', 'upload_date': '20170925', } + }, { + # video_service_name = youtube + 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( url, video_id, headers={'Accept': 'application/json'}) - service_name = video_data['video_service_name'] - if service_name == 'YOUTUBE': + service_name = video_data['video_service_name'].lower() + if service_name == 'youtube': yt_video_id = video_data['video_service_id'] return self.url_result(yt_video_id, 'Youtube', video_id=yt_video_id) else: From 09322cccdb655954c63724459556bd3ebb585d48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 6 Sep 2018 00:22:30 +0700 Subject: [PATCH 426/488] [iprima] Confirm adult check (closes #17437) --- youtube_dl/extractor/iprima.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index a29e6a5ba..3c4b7e48b 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -38,6 +38,8 @@ class IPrimaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + self._set_cookie('play.iprima.cz', 'ott_adult_confirmed', '1') + webpage = self._download_webpage(url, video_id) video_id = self._search_regex(r'data-product="([^"]+)">', webpage, 'real id') From 2d4fe594c64249c263d739c8094b445c67fa09f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 6 Sep 2018 00:51:20 +0700 Subject: [PATCH 427/488] [pornhub:uservideos] Add support for new URLs (closes #17388) --- youtube_dl/extractor/pornhub.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index ffc4405a8..6782848d9 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -254,7 +254,7 @@ class PornHubIE(InfoExtractor): self._sort_formats(formats) video_uploader = self._html_search_regex( - r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', + r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', webpage, 'uploader', fatal=False) view_count = self._extract_count( @@ -346,7 +346,7 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE): class PornHubUserVideosIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:user|channel)s/(?P<id>[^/]+)/videos' + _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos' _TESTS = [{ 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'info_dict': { @@ -378,6 +378,12 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): }, { 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/model/jayndrea/videos/upload', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', + 'only_matching': True, }] def _real_extract(self, url): From 9a47fa35dd9dd2d53f4d3f088811ea29295991e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Sep 2018 03:36:10 +0700 Subject: [PATCH 428/488] [youtube] Fix extraction (closes #17457, closes #17464) --- youtube_dl/extractor/youtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0442906df..27047425d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1178,7 +1178,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\('), + r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', + r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) From a41a506077936b2574cbb34afa83f8215accc3a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Sep 2018 03:40:06 +0700 Subject: [PATCH 429/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4632133a3..b378f0a0e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +version <unreleased> + +Extractors +* [youtube] Fix extraction (#17457, #17464) ++ [pornhub:uservideos] Add support for new URLs (#17388) +* [iprima] Confirm adult check (#17437) +* [slideslive] Make check for video service name case-insensitive (#17429) +* [radiojavan] Fix extraction (#17151) +* [generic] Skip unsuccessful jwplayer extraction (#16735) + + version 2018.09.01 Core From ad98d2eb748b593dc044296493904607c3058e8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Sep 2018 03:42:28 +0700 Subject: [PATCH 430/488] release 2018.09.08 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 16d8f36d7..2d67247e6 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.09.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.09.01** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.09.08*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.09.08** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.09.01 +[debug] youtube-dl version 2018.09.08 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index b378f0a0e..ac95c9b71 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.09.08 Extractors * [youtube] Fix extraction (#17457, #17464) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e655e0050..716d9ffe0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.09.01' +__version__ = '2018.09.08' From d0c5fabc1215fddd354fd91cd65082e45df1165e Mon Sep 17 00:00:00 2001 From: Timendum <timedum@gmail.com> Date: Sat, 8 Sep 2018 09:44:06 +0200 Subject: [PATCH 431/488] [nbc] Fix extraction of percent encoded URLs (closes #17374) --- youtube_dl/extractor/nbc.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index c843f8649..765c46fd2 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -7,6 +7,7 @@ import re from .common import InfoExtractor from .theplatform import ThePlatformIE from .adobepass import AdobePassIE +from ..compat import compat_urllib_parse_unquote from ..utils import ( find_xpath_attr, smuggle_url, @@ -75,11 +76,16 @@ class NBCIE(AdobePassIE): 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', 'only_matching': True, }, + { + # Percent escaped url + 'url': 'https://www.nbc.com/up-all-night/video/day-after-valentine%27s-day/n2189', + 'only_matching': True, + } ] def _real_extract(self, url): permalink, video_id = re.match(self._VALID_URL, url).groups() - permalink = 'http' + permalink + permalink = 'http' + compat_urllib_parse_unquote(permalink) response = self._download_json( 'https://api.nbc.com/v3/videos', video_id, query={ 'filter[permalink]': permalink, From d0de6a287ac64b52630e5b21c3db52a4312675fd Mon Sep 17 00:00:00 2001 From: Jens Rutschmann <N/A> Date: Sat, 1 Sep 2018 19:43:34 +0200 Subject: [PATCH 432/488] [tele5] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tele5.py | 39 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 youtube_dl/extractor/tele5.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 995af9988..7dc569724 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1086,6 +1086,7 @@ from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .tele5 import Tele5IE from .tele13 import Tele13IE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py new file mode 100644 index 000000000..cfa8d2475 --- /dev/null +++ b/youtube_dl/extractor/tele5.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .nexx import NexxIE + + +class Tele5IE(InfoExtractor): + _VALID_URL = r'https://www\.tele5\.de/(?:mediathek/filme-online/videos\?vid=|tv/)(?P<display_id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1550589', + 'info_dict': { + 'id': '1550589', + 'ext': 'mp4', + 'upload_date': '20180822', + 'timestamp': 1534927316, + 'title': 'SchleFaZ: Atomic Shark', + } + }, { + 'url': 'https://www.tele5.de/tv/dark-matter/videos', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + video_id = self._html_search_regex( + r'id\s*=\s*["\']video-player["\']\s*data-id\s*=\s*["\']([0-9]+)["\']', + webpage, 'video_id') + + return self.url_result( + 'https://api.nexx.cloud/v3/759/videos/byid/%s' % video_id, + ie=NexxIE.ie_key(), video_id=video_id) From ae2384ff5f9d6a18209139a4b2e2cd33921be63a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Sep 2018 16:04:39 +0700 Subject: [PATCH 433/488] [tele5] Improve extraction (closes #7805, closes #7922, closes #17331, closes #17414) --- youtube_dl/extractor/tele5.py | 39 ++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index cfa8d2475..25573e49f 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -1,38 +1,43 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from .nexx import NexxIE +from ..compat import compat_urlparse class Tele5IE(InfoExtractor): - _VALID_URL = r'https://www\.tele5\.de/(?:mediathek/filme-online/videos\?vid=|tv/)(?P<display_id>[\w-]+)' - + _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:mediathek|tv)/(?P<id>[^?#&]+)' _TESTS = [{ - 'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1550589', + 'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416', 'info_dict': { - 'id': '1550589', + 'id': '1549416', 'ext': 'mp4', - 'upload_date': '20180822', - 'timestamp': 1534927316, - 'title': 'SchleFaZ: Atomic Shark', - } + 'upload_date': '20180814', + 'timestamp': 1534290623, + 'title': 'Pandorum', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.tele5.de/tv/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', + 'only_matching': True, }, { 'url': 'https://www.tele5.de/tv/dark-matter/videos', 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - webpage = self._download_webpage(url, display_id) - - video_id = self._html_search_regex( - r'id\s*=\s*["\']video-player["\']\s*data-id\s*=\s*["\']([0-9]+)["\']', - webpage, 'video_id') + if not video_id: + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._html_search_regex( + r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](\d+)', + webpage, 'video id') return self.url_result( 'https://api.nexx.cloud/v3/759/videos/byid/%s' % video_id, From 6f9f3340bb4c837d86067856e12232fc9695e698 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Sep 2018 17:24:09 +0700 Subject: [PATCH 434/488] [dtube] PEP 8 (#17455) --- youtube_dl/extractor/dtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dtube.py b/youtube_dl/extractor/dtube.py index 4ca97f860..5887887e1 100644 --- a/youtube_dl/extractor/dtube.py +++ b/youtube_dl/extractor/dtube.py @@ -59,7 +59,7 @@ class DTubeIE(InfoExtractor): try: self.to_screen('%s: Checking %s video format URL' % (video_id, format_id)) self._downloader._opener.open(video_url, timeout=5).close() - except timeout as e: + except timeout: self.to_screen( '%s: %s URL is invalid, skipping' % (video_id, format_id)) continue From 13ef64fd9340794e214ba64c258fa0999fbd0a17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Sep 2018 17:24:34 +0700 Subject: [PATCH 435/488] [motherless] PEP 8 (#17455) --- youtube_dl/extractor/motherless.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index bed5645f2..d4bd273b6 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -167,9 +167,9 @@ class MotherlessGroupIE(InfoExtractor): if not entries: entries = [ self.url_result( - compat_urlparse.urljoin(base, '/' + video_id), - ie=MotherlessIE.ie_key(), video_id=video_id) - for video_id in orderedSet(re.findall( + compat_urlparse.urljoin(base, '/' + entry_id), + ie=MotherlessIE.ie_key(), video_id=entry_id) + for entry_id in orderedSet(re.findall( r'data-codename=["\']([A-Z0-9]+)', webpage))] return entries From 2c9d3b9962e28b61615e382bfcaf6db6d6987c09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Sep 2018 17:24:48 +0700 Subject: [PATCH 436/488] [seznamzpravy] PEP 8 (#17455) --- youtube_dl/extractor/seznamzpravy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index 6d4e3b76d..7a1c7e38b 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -164,6 +164,6 @@ class SeznamZpravyArticleIE(InfoExtractor): description = info.get('description') or self._og_search_description(webpage) return self.playlist_result([ - self.url_result(url, ie=SeznamZpravyIE.ie_key()) - for url in SeznamZpravyIE._extract_urls(webpage)], + self.url_result(entry_url, ie=SeznamZpravyIE.ie_key()) + for entry_url in SeznamZpravyIE._extract_urls(webpage)], article_id, title, description) From 2e4350eec6981ffb12617abda7623e4fb565ce8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Sep 2018 17:24:59 +0700 Subject: [PATCH 437/488] [generic] PEP 8 (#17455) --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1db154c4f..76ef01332 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -3112,7 +3112,7 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) - sharevideos_urls = [mobj.group('url') for mobj in re.finditer( + sharevideos_urls = [sharevideos_mobj.group('url') for sharevideos_mobj in re.finditer( r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] if sharevideos_urls: From a2637a2dda5e90b83bd2f403d06ad1ce21fc8c5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 10 Sep 2018 01:34:49 +0700 Subject: [PATCH 438/488] [iprima] Add support for prima.iprima.cz (closes #17514) --- youtube_dl/extractor/iprima.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 3c4b7e48b..1d58d6e85 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -12,7 +12,7 @@ from ..utils import ( class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://play\.iprima\.cz/(?:.+/)?(?P<id>[^?#]+)' + _VALID_URL = r'https?://(?:play|prima)\.iprima\.cz/(?:.+/)?(?P<id>[^?#]+)' _GEO_BYPASS = False _TESTS = [{ @@ -33,6 +33,14 @@ class IPrimaIE(InfoExtractor): # geo restricted 'url': 'http://play.iprima.cz/closer-nove-pripady/closer-nove-pripady-iv-1', 'only_matching': True, + }, { + # iframe api.play-backend.iprima.cz + 'url': 'https://prima.iprima.cz/my-little-pony/mapa-znameni-2-2', + 'only_matching': True, + }, { + # iframe prima.iprima.cz + 'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha', + 'only_matching': True, }] def _real_extract(self, url): @@ -42,7 +50,10 @@ class IPrimaIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_id = self._search_regex(r'data-product="([^"]+)">', webpage, 'real id') + video_id = self._search_regex( + (r'<iframe[^>]+\bsrc=["\'](?:https?:)?//(?:api\.play-backend\.iprima\.cz/prehravac/embedded|prima\.iprima\.cz/[^/]+/[^/]+)\?.*?\bid=(p\d+)', + r'data-product="([^"]+)">'), + webpage, 'real id') playerpage = self._download_webpage( 'http://play.iprima.cz/prehravac/init', From 25d110be30b92f785617140b0617a73d8eec5f7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 10 Sep 2018 02:37:22 +0700 Subject: [PATCH 439/488] [utils] Properly recognize AV1 codec (closes #17506) --- test/test_utils.py | 4 ++++ youtube_dl/utils.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 8da5ccc56..9e28e008f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -785,6 +785,10 @@ class TestUtil(unittest.TestCase): 'vcodec': 'h264', 'acodec': 'aac', }) + self.assertEqual(parse_codecs('av01.0.05M.08'), { + 'vcodec': 'av01.0.05M.08', + 'acodec': 'none', + }) def test_escape_rfc3986(self): reserved = "!*'();:@&=+$,/?#[]" diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bcfb72d43..e84d35d4d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2477,7 +2477,7 @@ def parse_codecs(codecs_str): vcodec, acodec = None, None for full_codec in splited_codecs: codec = full_codec.split('.')[0] - if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'): + if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01'): if not vcodec: vcodec = full_codec elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): From 14f577e31c3d11e84bda5048d11bc7c31a830e2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 10 Sep 2018 02:45:44 +0700 Subject: [PATCH 440/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ChangeLog b/ChangeLog index ac95c9b71..c02450bf4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +version <unreleased> + +Core ++ [utils] Properly recognize AV1 codec (#17506) + +Extractors ++ [iprima] Add support for prima.iprima.cz (#17514) ++ [tele5] Add support for tele5.de (#7805, #7922, #17331, #17414) +* [nbc] Fix extraction of percent encoded URLs (#17374) + + version 2018.09.08 Extractors From 8476b4fd91ce71c052a3bdea9e010e5196331606 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 10 Sep 2018 02:48:37 +0700 Subject: [PATCH 441/488] release 2018.09.10 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 2d67247e6..f41266f32 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.09.08*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.09.08** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.09.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.09.10** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.09.08 +[debug] youtube-dl version 2018.09.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index c02450bf4..d184f69ee 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.09.10 Core + [utils] Properly recognize AV1 codec (#17506) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index f0d6be901..9b8601751 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -847,6 +847,7 @@ - **techtv.mit.edu** - **ted** - **Tele13** + - **Tele5** - **TeleBruxelles** - **Telecinco**: telecinco.es, cuatro.com and mediaset.es - **Telegraaf** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 716d9ffe0..b078c4993 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.09.08' +__version__ = '2018.09.10' From 96dbf70de63c08fc27e9bd7bec97670325225758 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 11 Sep 2018 02:24:32 +0700 Subject: [PATCH 442/488] [eporner] Extract JSON-LD (closes #17519) --- youtube_dl/extractor/eporner.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 6d03d7095..c050bf9df 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -9,6 +9,7 @@ from ..utils import ( encode_base_n, ExtractorError, int_or_none, + merge_dicts, parse_duration, str_to_int, url_or_none, @@ -25,10 +26,16 @@ class EpornerIE(InfoExtractor): 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', 'ext': 'mp4', 'title': 'Infamous Tiffany Teen Strip Tease Video', + 'description': 'md5:764f39abf932daafa37485eb46efa152', + 'timestamp': 1232520922, + 'upload_date': '20090121', 'duration': 1838, 'view_count': int, 'age_limit': 18, }, + 'params': { + 'proxy': '127.0.0.1:8118' + } }, { # New (May 2016) URL layout 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/', @@ -104,12 +111,15 @@ class EpornerIE(InfoExtractor): }) self._sort_formats(formats) - duration = parse_duration(self._html_search_meta('duration', webpage)) + json_ld = self._search_json_ld(webpage, display_id, default={}) + + duration = parse_duration(self._html_search_meta( + 'duration', webpage, default=None)) view_count = str_to_int(self._search_regex( r'id="cinemaviews">\s*([0-9,]+)\s*<small>views', webpage, 'view count', fatal=False)) - return { + return merge_dicts(json_ld, { 'id': video_id, 'display_id': display_id, 'title': title, @@ -117,4 +127,4 @@ class EpornerIE(InfoExtractor): 'view_count': view_count, 'formats': formats, 'age_limit': 18, - } + }) From 79facb27735baf27d65baf050121d68e26654a02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 11 Sep 2018 02:29:45 +0700 Subject: [PATCH 443/488] [tube8] Fix metadata extraction (closes #17520) --- youtube_dl/extractor/tube8.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 368c45729..db93b0182 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -45,7 +45,7 @@ class Tube8IE(KeezMoviesIE): r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') description = self._html_search_regex( - r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False) + r'(?s)Description:</dt>\s*<dd>(.+?)</dd>', webpage, 'description', fatal=False) uploader = self._html_search_regex( r'<span class="username">\s*(.+?)\s*<', webpage, 'uploader', fatal=False) @@ -55,19 +55,19 @@ class Tube8IE(KeezMoviesIE): dislike_count = int_or_none(self._search_regex( r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False)) view_count = str_to_int(self._search_regex( - r'<strong>Views: </strong>([\d,\.]+)\s*</li>', + r'Views:\s*</dt>\s*<dd>([\d,\.]+)', webpage, 'view count', fatal=False)) comment_count = str_to_int(self._search_regex( r'<span id="allCommentsCount">(\d+)</span>', webpage, 'comment count', fatal=False)) category = self._search_regex( - r'Category:\s*</strong>\s*<a[^>]+href=[^>]+>([^<]+)', + r'Category:\s*</dt>\s*<dd>\s*<a[^>]+href=[^>]+>([^<]+)', webpage, 'category', fatal=False) categories = [category] if category else None tags_str = self._search_regex( - r'(?s)Tags:\s*</strong>(.+?)</(?!a)', + r'(?s)Tags:\s*</dt>\s*<dd>(.+?)</(?!a)', webpage, 'tags', fatal=False) tags = [t for t in re.findall( r'<a[^>]+href=[^>]+>([^<]+)', tags_str)] if tags_str else None From f5b0175349aa85a1b366d12a487ebd66aec5e1b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 11 Sep 2018 02:41:05 +0700 Subject: [PATCH 444/488] [vzaar] Add support for HLS --- youtube_dl/extractor/vzaar.py | 60 +++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/vzaar.py b/youtube_dl/extractor/vzaar.py index 02fcd52c7..6000671c3 100644 --- a/youtube_dl/extractor/vzaar.py +++ b/youtube_dl/extractor/vzaar.py @@ -4,15 +4,19 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, float_or_none, + unified_timestamp, + url_or_none, ) class VzaarIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|view)\.)?vzaar\.com/(?:videos/)?(?P<id>\d+)' _TESTS = [{ + # HTTP and HLS 'url': 'https://vzaar.com/videos/1152805', 'md5': 'bde5ddfeb104a6c56a93a06b04901dbf', 'info_dict': { @@ -40,24 +44,48 @@ class VzaarIE(InfoExtractor): video_id = self._match_id(url) video_data = self._download_json( 'http://view.vzaar.com/v2/%s/video' % video_id, video_id) - source_url = video_data['sourceUrl'] - info = { + title = video_data['videoTitle'] + + formats = [] + + source_url = url_or_none(video_data.get('sourceUrl')) + if source_url: + f = { + 'url': source_url, + 'format_id': 'http', + } + if 'audio' in source_url: + f.update({ + 'vcodec': 'none', + 'ext': 'mp3', + }) + else: + f.update({ + 'width': int_or_none(video_data.get('width')), + 'height': int_or_none(video_data.get('height')), + 'ext': 'mp4', + 'fps': float_or_none(video_data.get('fps')), + }) + formats.append(f) + + video_guid = video_data.get('guid') + usp = video_data.get('usp') + if isinstance(video_guid, compat_str) and isinstance(usp, dict): + m3u8_url = ('http://fable.vzaar.com/v4/usp/%s/%s.ism/.m3u8?' + % (video_guid, video_id)) + '&'.join( + '%s=%s' % (k, v) for k, v in usp.items()) + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + + return { 'id': video_id, - 'title': video_data['videoTitle'], - 'url': source_url, + 'title': title, 'thumbnail': self._proto_relative_url(video_data.get('poster')), 'duration': float_or_none(video_data.get('videoDuration')), + 'timestamp': unified_timestamp(video_data.get('ts')), + 'formats': formats, } - if 'audio' in source_url: - info.update({ - 'vcodec': 'none', - 'ext': 'mp3', - }) - else: - info.update({ - 'width': int_or_none(video_data.get('width')), - 'height': int_or_none(video_data.get('height')), - 'ext': 'mp4', - }) - return info From db348e88495bd87adcf058bb6c0a26c1fc591f4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 14 Sep 2018 23:26:38 +0700 Subject: [PATCH 445/488] [twitch:clips] Extend _VALID_URL (closes #17559) --- youtube_dl/extractor/twitch.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index b39972b1e..26661fdf6 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -559,7 +559,8 @@ class TwitchStreamIE(TwitchBaseIE): TwitchAllVideosIE, TwitchUploadsIE, TwitchPastBroadcastsIE, - TwitchHighlightsIE)) + TwitchHighlightsIE, + TwitchClipsIE)) else super(TwitchStreamIE, cls).suitable(url)) def _real_extract(self, url): @@ -633,7 +634,7 @@ class TwitchStreamIE(TwitchBaseIE): class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' - _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:[^/]+/)*|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', @@ -653,6 +654,9 @@ class TwitchClipsIE(TwitchBaseIE): # multiple formats 'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy', 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/sergeynixon/clip/StormyThankfulSproutFutureMan', + 'only_matching': True, }] def _real_extract(self, url): From 0f2aa0dcaa5ebffbca781f8ecd15c4d52a80f011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 14 Sep 2018 23:56:03 +0700 Subject: [PATCH 446/488] [asiancrush] Fix extraction (closes #15630) --- youtube_dl/extractor/asiancrush.py | 42 ++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/asiancrush.py b/youtube_dl/extractor/asiancrush.py index 594c88c9c..6d71c5ad5 100644 --- a/youtube_dl/extractor/asiancrush.py +++ b/youtube_dl/extractor/asiancrush.py @@ -8,7 +8,6 @@ from .kaltura import KalturaIE from ..utils import ( extract_attributes, remove_end, - urlencode_postdata, ) @@ -34,19 +33,40 @@ class AsianCrushIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'https://www.asiancrush.com/wp-admin/admin-ajax.php', video_id, - data=urlencode_postdata({ - 'postid': video_id, - 'action': 'get_channel_kaltura_vars', - })) + webpage = self._download_webpage(url, video_id) - entry_id = data['entry_id'] + entry_id, partner_id, title = [None] * 3 + + vars = self._parse_json( + self._search_regex( + r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars', + default='{}'), video_id, fatal=False) + if vars: + entry_id = vars.get('entry_id') + partner_id = vars.get('partner_id') + title = vars.get('vid_label') + + if not entry_id: + entry_id = self._search_regex( + r'\bentry_id["\']\s*:\s*["\'](\d+)', webpage, 'entry id') + + player = self._download_webpage( + 'https://api.asiancrush.com/embeddedVideoPlayer', video_id, + query={'id': entry_id}) + + kaltura_id = self._search_regex( + r'entry_id["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', player, + 'kaltura id', group='id') + + if not partner_id: + partner_id = self._search_regex( + r'/p(?:artner_id)?/(\d+)', player, 'partner id', + default='513551') return self.url_result( - 'kaltura:%s:%s' % (data['partner_id'], entry_id), - ie=KalturaIE.ie_key(), video_id=entry_id, - video_title=data.get('vid_label')) + 'kaltura:%s:%s' % (partner_id, kaltura_id), + ie=KalturaIE.ie_key(), video_id=kaltura_id, + video_title=title) class AsianCrushPlaylistIE(InfoExtractor): From 15bf2ca0da0ae94dbc6c3b76c89910ceaea88bd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Sep 2018 00:42:27 +0700 Subject: [PATCH 447/488] [porntube] Fix extraction (closes #17541) --- youtube_dl/extractor/fourtube.py | 94 ++++++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index ad273a0e7..20391b2bb 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -3,15 +3,44 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_b64decode, + compat_str, + compat_urllib_parse_unquote, + compat_urlparse, +) from ..utils import ( + int_or_none, parse_duration, parse_iso8601, str_to_int, + try_get, + unified_timestamp, + url_or_none, ) class FourTubeBaseIE(InfoExtractor): + _TKN_HOST = 'tkn.kodicdn.com' + + def _extract_formats(self, url, video_id, media_id, sources): + token_url = 'https://%s/%s/desktop/%s' % ( + self._TKN_HOST, media_id, '+'.join(sources)) + + parsed_url = compat_urlparse.urlparse(url) + tokens = self._download_json(token_url, video_id, data=b'', headers={ + 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname), + 'Referer': url, + }) + formats = [{ + 'url': tokens[format]['token'], + 'format_id': format + 'p', + 'resolution': format + 'p', + 'quality': int(format), + } for format in sources] + self._sort_formats(formats) + return formats + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) kind, video_id, display_id = mobj.group('kind', 'id', 'display_id') @@ -68,21 +97,7 @@ class FourTubeBaseIE(InfoExtractor): media_id = params[0] sources = ['%s' % p for p in params[2]] - token_url = 'https://tkn.kodicdn.com/{0}/desktop/{1}'.format( - media_id, '+'.join(sources)) - - parsed_url = compat_urlparse.urlparse(url) - tokens = self._download_json(token_url, video_id, data=b'', headers={ - 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname), - 'Referer': url, - }) - formats = [{ - 'url': tokens[format]['token'], - 'format_id': format + 'p', - 'resolution': format + 'p', - 'quality': int(format), - } for format in sources] - self._sort_formats(formats) + formats = self._extract_formats(url, video_id, media_id, sources) return { 'id': video_id, @@ -164,6 +179,7 @@ class FuxIE(FourTubeBaseIE): class PornTubeIE(FourTubeBaseIE): _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?porntube\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)' _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s' + _TKN_HOST = 'tkn.porntube.com' _TESTS = [{ 'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759', 'info_dict': { @@ -171,13 +187,12 @@ class PornTubeIE(FourTubeBaseIE): 'ext': 'mp4', 'title': 'Teen couple doing anal', 'uploader': 'Alexy', - 'uploader_id': 'Alexy', + 'uploader_id': '91488', 'upload_date': '20150606', 'timestamp': 1433595647, 'duration': 5052, 'view_count': int, 'like_count': int, - 'categories': list, 'age_limit': 18, }, 'params': { @@ -191,6 +206,49 @@ class PornTubeIE(FourTubeBaseIE): 'only_matching': True, }] + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + video = self._parse_json( + self._search_regex( + r'INITIALSTATE\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + webpage, 'data', group='value'), video_id, + transform_source=lambda x: compat_urllib_parse_unquote( + compat_b64decode(x).decode('utf-8')))['page']['video'] + + title = video['title'] + media_id = video['mediaId'] + sources = [compat_str(e['height']) + for e in video['encodings'] if e.get('height')] + formats = self._extract_formats(url, video_id, media_id, sources) + + thumbnail = url_or_none(video.get('masterThumb')) + uploader = try_get(video, lambda x: x['user']['username'], compat_str) + uploader_id = compat_str(try_get(video, lambda x: x['user']['id'], int)) + like_count = int_or_none(video.get('likes')) + dislike_count = int_or_none(video.get('dislikes')) + view_count = int_or_none(video.get('playsQty')) + duration = int_or_none(video.get('durationInSeconds')) + timestamp = unified_timestamp(video.get('publishedAt')) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'timestamp': timestamp, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'view_count': view_count, + 'duration': duration, + 'age_limit': 18, + } + class PornerBrosIE(FourTubeBaseIE): _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?pornerbros\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)' From 6f1f59f39cf934d5ccae3ca2e6fd4eaab726137f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Sep 2018 01:23:36 +0700 Subject: [PATCH 448/488] [extractor/common] Introduce channel meta fields --- youtube_dl/extractor/common.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b8bbaf81a..8eab5947f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -211,6 +211,11 @@ class InfoExtractor(object): If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. uploader_url: Full URL to a personal webpage of the video uploader. + channel: Full name of the channel the video is uploaded on. + Note that channel fields may or may noy repeat uploader + fields. This depends on a particular extractor. + channel_id: Id of the channel. + channel_url: Full URL to a channel webpage. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. "tag" is usually a language code, and From dd4c449219976ad6e80fef74a3431c5553cf5031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Sep 2018 01:24:26 +0700 Subject: [PATCH 449/488] [youtube] Extract channel meta fields (closes #9676, closes #12939) --- youtube_dl/extractor/youtube.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 27047425d..2fe074cb4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -490,6 +490,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', @@ -1907,6 +1909,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: self._downloader.report_warning('unable to extract uploader nickname') + channel_id = self._html_search_meta( + 'channelId', video_webpage, 'channel id') + channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + # thumbnail image # We try first to get a high quality image: m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', @@ -2078,6 +2084,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'uploader_url': video_uploader_url, + 'channel_id': channel_id, + 'channel_url': channel_url, 'upload_date': upload_date, 'license': video_license, 'creator': video_creator or artist, From d03beddf0f0b460d1b58c98173e2d4620d1d11b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Sep 2018 01:24:48 +0700 Subject: [PATCH 450/488] [vimeo] Extract channel meta fields --- youtube_dl/extractor/vimeo.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index e49b233f2..95d368cc1 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -299,10 +299,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio', 'uploader_id': 'atencio', 'uploader': 'Peter Atencio', + 'channel_id': 'keypeele', + 'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/keypeele', 'timestamp': 1380339469, 'upload_date': '20130928', 'duration': 187, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'http://vimeo.com/76979871', @@ -355,11 +358,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/channels/tributes/6213729', 'info_dict': { 'id': '6213729', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'Vimeo Tribute: The Shining', 'uploader': 'Casey Donahue', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/caseydonahue', 'uploader_id': 'caseydonahue', + 'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/tributes', + 'channel_id': 'tributes', 'timestamp': 1250886430, 'upload_date': '20090821', 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', @@ -465,6 +470,9 @@ class VimeoIE(VimeoBaseInfoExtractor): if 'Referer' not in headers: headers['Referer'] = url + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) + # Extract ID from URL mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -652,6 +660,8 @@ class VimeoIE(VimeoBaseInfoExtractor): r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') + channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None + info_dict = { 'id': video_id, 'formats': formats, @@ -662,6 +672,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': like_count, 'comment_count': comment_count, 'license': cc_license, + 'channel_id': channel_id, + 'channel_url': channel_url, } info_dict = merge_dicts(info_dict, info_dict_config, json_ld) From 127103b643c5105bd361c5e3be267ac40843ccdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Sep 2018 01:25:32 +0700 Subject: [PATCH 451/488] [porntube] Extract channel meta fields --- youtube_dl/extractor/fourtube.py | 33 +++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index 20391b2bb..a9a1f911e 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -13,6 +13,7 @@ from ..utils import ( int_or_none, parse_duration, parse_iso8601, + str_or_none, str_to_int, try_get, unified_timestamp, @@ -198,6 +199,26 @@ class PornTubeIE(FourTubeBaseIE): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.porntube.com/videos/squirting-teen-ballerina-ecg_1331406', + 'info_dict': { + 'id': '1331406', + 'ext': 'mp4', + 'title': 'Squirting Teen Ballerina on ECG', + 'uploader': 'Exploited College Girls', + 'uploader_id': '665', + 'channel': 'Exploited College Girls', + 'channel_id': '665', + 'upload_date': '20130920', + 'timestamp': 1379685485, + 'duration': 851, + 'view_count': int, + 'like_count': int, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.porntube.com/embed/7089759', 'only_matching': True, @@ -227,7 +248,11 @@ class PornTubeIE(FourTubeBaseIE): thumbnail = url_or_none(video.get('masterThumb')) uploader = try_get(video, lambda x: x['user']['username'], compat_str) - uploader_id = compat_str(try_get(video, lambda x: x['user']['id'], int)) + uploader_id = str_or_none(try_get( + video, lambda x: x['user']['id'], int)) + channel = try_get(video, lambda x: x['channel']['name'], compat_str) + channel_id = str_or_none(try_get( + video, lambda x: x['channel']['id'], int)) like_count = int_or_none(video.get('likes')) dislike_count = int_or_none(video.get('dislikes')) view_count = int_or_none(video.get('playsQty')) @@ -239,8 +264,10 @@ class PornTubeIE(FourTubeBaseIE): 'title': title, 'formats': formats, 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'uploader': uploader or channel, + 'uploader_id': uploader_id or channel_id, + 'channel': channel, + 'channel_id': channel_id, 'timestamp': timestamp, 'like_count': like_count, 'dislike_count': dislike_count, From 0e7b8d3eaca710fbae8aa5531e28a6b6f05b387a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Sep 2018 01:53:01 +0700 Subject: [PATCH 452/488] [extractor/common] Fix typos --- youtube_dl/extractor/common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8eab5947f..2dbf81e6e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -212,7 +212,7 @@ class InfoExtractor(object): uploader_id: Nickname or id of the video uploader. uploader_url: Full URL to a personal webpage of the video uploader. channel: Full name of the channel the video is uploaded on. - Note that channel fields may or may noy repeat uploader + Note that channel fields may or may not repeat uploader fields. This depends on a particular extractor. channel_id: Id of the channel. channel_url: Full URL to a channel webpage. @@ -1706,9 +1706,9 @@ class InfoExtractor(object): # However, this is not always respected, for example, [2] # contains EXT-X-STREAM-INF tag which references AUDIO # rendition group but does not have CODECS and despite - # referencing audio group an audio group, it represents - # a complete (with audio and video) format. So, for such cases - # we will ignore references to rendition groups and treat them + # referencing an audio group it represents a complete + # (with audio and video) format. So, for such cases we will + # ignore references to rendition groups and treat them # as complete formats. if audio_group_id and codecs and f.get('vcodec') != 'none': audio_group = groups.get(audio_group_id) From 3661ebf2b6e0cb951dca5b4498a1d6885cbf7b56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Sep 2018 02:04:43 +0700 Subject: [PATCH 453/488] [pornhub] Extract upload date (closes #17574) --- youtube_dl/extractor/pornhub.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 6782848d9..19eaf389f 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -40,6 +40,7 @@ class PornHubIE(InfoExtractor): 'ext': 'mp4', 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', 'uploader': 'Babes', + 'upload_date': '20130628', 'duration': 361, 'view_count': int, 'like_count': int, @@ -57,6 +58,7 @@ class PornHubIE(InfoExtractor): 'ext': 'mp4', 'title': '重庆婷婷女王足交', 'uploader': 'Unknown', + 'upload_date': '20150213', 'duration': 1753, 'view_count': int, 'like_count': int, @@ -237,8 +239,14 @@ class PornHubIE(InfoExtractor): video_urls.append((video_url, None)) video_urls_set.add(video_url) + upload_date = None formats = [] for video_url, height in video_urls: + if not upload_date: + upload_date = self._search_regex( + r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) + if upload_date: + upload_date = upload_date.replace('/', '') tbr = None mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url) if mobj: @@ -278,6 +286,7 @@ class PornHubIE(InfoExtractor): return { 'id': video_id, 'uploader': video_uploader, + 'upload_date': upload_date, 'title': title, 'thumbnail': thumbnail, 'duration': duration, From 8b40c92724453995701b75559b190075d6dbfd7d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 15 Sep 2018 06:30:57 +0100 Subject: [PATCH 454/488] [vimeo] redirect to feature url only in the case of a trailer(closes #14591) --- youtube_dl/extractor/vimeo.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 95d368cc1..0a9239b62 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -571,19 +571,23 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password(redirect_url, video_id) + vod = config.get('video', {}).get('vod', {}) + def is_rented(): if '>You rented this title.<' in webpage: return True if config.get('user', {}).get('purchased'): return True - label = try_get( - config, lambda x: x['video']['vod']['purchase_options'][0]['label_string'], compat_str) - if label and label.startswith('You rented this'): - return True + for purchase_option in vod.get('purchase_options', []): + if purchase_option.get('purchased'): + return True + label = purchase_option.get('label_string') + if label and (label.startswith('You rented this') or label.endswith(' remaining')): + return True return False - if is_rented(): - feature_id = config.get('video', {}).get('vod', {}).get('feature_id') + if is_rented() and vod.get('is_trailer'): + feature_id = vod.get('feature_id') if feature_id and not data.get('force_feature_id', False): return self.url_result(smuggle_url( 'https://player.vimeo.com/player/%s' % feature_id, From d9b0d118adfa365e8ed67de277f5e83e29c4635c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 16 Sep 2018 23:52:27 +0700 Subject: [PATCH 455/488] [vrv] Make format ids deterministic --- youtube_dl/extractor/vrv.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 921e9e172..ac0819c7c 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -90,7 +90,13 @@ class VRVIE(VRVBaseIE): def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): if not url or stream_format not in ('hls', 'dash'): return [] - stream_id = hardsub_lang or audio_lang + assert audio_lang or hardsub_lang + stream_id_list = [] + if audio_lang: + stream_id_list.append('audio-%s' % audio_lang) + if hardsub_lang: + stream_id_list.append('hardsub-%s' % hardsub_lang) + stream_id = '-'.join(stream_id_list) format_id = '%s-%s' % (stream_format, stream_id) if stream_format == 'hls': adaptive_formats = self._extract_m3u8_formats( From 1084563eaab3a039ea8ccb65f2507e919d8e11f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 16 Sep 2018 23:54:25 +0700 Subject: [PATCH 456/488] [crunchyroll] Prefer hardsubless formats and formats in locale language --- youtube_dl/extractor/crunchyroll.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index ba8b9fa7e..af786d096 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -445,6 +445,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage, 'vilos media', default='{}'), video_id) media_metadata = media.get('metadata') or {} + language = self._search_regex( + r'(?:vilos\.config\.player\.language|LOCALE)\s*=\s*(["\'])(?P<lang>(?:(?!\1).)+)\1', + webpage, 'language', default=None, group='lang') + video_title = self._html_search_regex( r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', webpage, 'video_title') @@ -466,9 +470,22 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text formats = [] for stream in media.get('streams', []): - formats.extend(self._extract_vrv_formats( + audio_lang = stream.get('audio_lang') + hardsub_lang = stream.get('hardsub_lang') + vrv_formats = self._extract_vrv_formats( stream.get('url'), video_id, stream.get('format'), - stream.get('audio_lang'), stream.get('hardsub_lang'))) + audio_lang, hardsub_lang) + for f in vrv_formats: + if not hardsub_lang: + f['preference'] = 1 + language_preference = 0 + if audio_lang == language: + language_preference += 1 + if hardsub_lang == language: + language_preference += 1 + if language_preference: + f['language_preference'] = language_preference + formats.extend(vrv_formats) if not formats: available_fmts = [] for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): @@ -557,7 +574,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'ext': 'flv', }) formats.append(format_info) - self._sort_formats(formats, ('height', 'width', 'tbr', 'fps')) + self._sort_formats(formats, ('preference', 'language_preference', 'height', 'width', 'tbr', 'fps')) metadata = self._call_rpc_api( 'VideoPlayer_GetMediaMetadata', video_id, From c11485162bcbf6f517fd9225850a048fc5f7cec1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 17 Sep 2018 22:13:39 +0700 Subject: [PATCH 457/488] [youtube] Don't pollute default query dict (closes #17593) --- youtube_dl/extractor/youtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2fe074cb4..e80e36f98 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -259,7 +259,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True def _download_webpage_handle(self, *args, **kwargs): - kwargs.setdefault('query', {})['disable_polymer'] = 'true' + query = kwargs.get('query', {}).copy() + query['disable_polymer'] = 'true' + kwargs['query'] = query return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) From e2f61598be49b75f16b501f7a4d24f4c8c230c49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 17 Sep 2018 22:14:28 +0700 Subject: [PATCH 458/488] [twitch] Don't pollute default headers dict --- youtube_dl/extractor/twitch.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 26661fdf6..401615683 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -51,7 +51,9 @@ class TwitchBaseIE(InfoExtractor): expected=True) def _call_api(self, path, item_id, *args, **kwargs): - kwargs.setdefault('headers', {})['Client-ID'] = self._CLIENT_ID + headers = kwargs.get('headers', {}).copy() + headers['Client-ID'] = self._CLIENT_ID + kwargs['headers'] = headers response = self._download_json( '%s/%s' % (self._API_BASE, path), item_id, *args, **compat_kwargs(kwargs)) From c8f6ab8c38f8397b06028cdea2839d6e25e2c809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 17 Sep 2018 22:14:53 +0700 Subject: [PATCH 459/488] [udemy] Don't pollute default headers dict --- youtube_dl/extractor/udemy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 79c45f80e..105826e9b 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -122,7 +122,9 @@ class UdemyIE(InfoExtractor): raise ExtractorError(error_str, expected=True) def _download_webpage_handle(self, *args, **kwargs): - kwargs.setdefault('headers', {})['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4' + headers = kwargs.get('headers', {}).copy() + headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4' + kwargs['headers'] = headers return super(UdemyIE, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) From e504b0907082512d5252643d7fd8c5ca67225e3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 17 Sep 2018 22:15:27 +0700 Subject: [PATCH 460/488] [adobepass] Don't pollute default headers dict --- youtube_dl/extractor/adobepass.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index b83b51efb..1cf2dcbf3 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -1325,8 +1325,8 @@ class AdobePassIE(InfoExtractor): _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' def _download_webpage_handle(self, *args, **kwargs): - headers = kwargs.get('headers', {}) - headers.update(self.geo_verification_headers()) + headers = self.geo_verification_headers() + headers.update(kwargs.get('headers', {})) kwargs['headers'] = headers return super(AdobePassIE, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) From d9b1cec1712c39d7e98a4ca44051643eebef8b30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 18 Sep 2018 01:44:55 +0700 Subject: [PATCH 461/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index d184f69ee..b2b2d90a1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version <unreleased> + +Core ++ [extractor/common] Introduce channel meta fields + +Extractors +* [adobepass] Don't pollute default headers dict +* [udemy] Don't pollute default headers dict +* [twitch] Don't pollute default headers dict +* [youtube] Don't pollute default query dict (#17593) +* [crunchyroll] Prefer hardsubless formats and formats in locale language +* [vrv] Make format ids deterministic +* [vimeo] Fix ondemand playlist extraction (#14591) ++ [pornhub] Extract upload date (#17574) ++ [porntube] Extract channel meta fields ++ [vimeo] Extract channel meta fields ++ [youtube] Extract channel meta fields (#9676, #12939) +* [porntube] Fix extraction (#17541) +* [asiancrush] Fix extraction (#15630) ++ [twitch:clips] Extend URL regular expression (closes #17559) ++ [vzaar] Add support for HLS +* [tube8] Fix metadata extraction (#17520) +* [eporner] Extract JSON-LD (#17519) + + version 2018.09.10 Core From 3a9c928426258e51c74515b09f7c6f565d60efe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 18 Sep 2018 01:46:36 +0700 Subject: [PATCH 462/488] release 2018.09.18 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f41266f32..a4602287a 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.09.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.09.10** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.09.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.09.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.09.10 +[debug] youtube-dl version 2018.09.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index b2b2d90a1..800ece790 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.09.18 Core + [extractor/common] Introduce channel meta fields diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b078c4993..2b3b584a4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.09.10' +__version__ = '2018.09.18' From 28fcb7b06178cebfc361f0d1c491ea9507b0d75e Mon Sep 17 00:00:00 2001 From: Leonardo Taccari <iamleot@gmail.com> Date: Wed, 19 Sep 2018 04:48:39 +0200 Subject: [PATCH 463/488] [raiplay:playlist] Remove a debug leftover print() --- youtube_dl/extractor/rai.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index f916b2619..548a6553b 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -274,7 +274,6 @@ class RaiPlayPlaylistIE(InfoExtractor): ('programma', 'nomeProgramma'), webpage, 'title') description = unescapeHTML(self._html_search_meta( ('description', 'og:description'), webpage, 'description')) - print(description) entries = [] for mobj in re.finditer( From 4ac73fc170f41ccb2d468a0b8563b2ec39d9e9b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 19 Sep 2018 22:16:43 +0700 Subject: [PATCH 464/488] [popcorntv] Remove debug output --- youtube_dl/extractor/popcorntv.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/popcorntv.py b/youtube_dl/extractor/popcorntv.py index ac901f426..9f834fb6c 100644 --- a/youtube_dl/extractor/popcorntv.py +++ b/youtube_dl/extractor/popcorntv.py @@ -58,8 +58,6 @@ class PopcornTVIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) timestamp = unified_timestamp(self._html_search_meta( 'uploadDate', webpage, 'timestamp')) - print(self._html_search_meta( - 'duration', webpage)) duration = int_or_none(self._html_search_meta( 'duration', webpage), invscale=60) view_count = int_or_none(self._html_search_meta( From 21160a179256cfbb859948138905ffe67cb970a8 Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Sun, 23 Sep 2018 16:34:47 +0200 Subject: [PATCH 465/488] [zattoo] Fix extraction (closes #17175) --- youtube_dl/extractor/zattoo.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index fb167c198..9c9024799 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -93,28 +93,30 @@ class ZattooBaseIE(InfoExtractor): def _extract_cid_and_video_info(self, video_id): data = self._download_json( - '%s/zapi/program/details' % self._HOST_URL, + '%s/zapi/v2/cached/program/power_details/%s' % ( + self._HOST_URL, self._power_guide_hash), video_id, 'Downloading video information', query={ - 'program_id': video_id, - 'complete': True + 'program_ids': video_id, + 'complete': True, }) - p = data['program'] + p = data['programs'][0] cid = p['cid'] info_dict = { 'id': video_id, - 'title': p.get('title') or p['episode_title'], - 'description': p.get('description'), - 'thumbnail': p.get('image_url'), + 'title': p.get('t') or p['et'], + 'description': p.get('d'), + 'thumbnail': p.get('i_url'), 'creator': p.get('channel_name'), - 'episode': p.get('episode_title'), - 'episode_number': int_or_none(p.get('episode_number')), - 'season_number': int_or_none(p.get('season_number')), + 'episode': p.get('et'), + 'episode_number': int_or_none(p.get('e_no')), + 'season_number': int_or_none(p.get('s_no')), 'release_year': int_or_none(p.get('year')), - 'categories': try_get(p, lambda x: x['categories'], list), + 'categories': try_get(p, lambda x: x['c'], list), + 'tags': try_get(p, lambda x: x['g'], list) } return cid, info_dict From f6d7f7b474075955afb02c373b8d2a442faaa0a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 23 Sep 2018 23:30:18 +0700 Subject: [PATCH 466/488] [zattoo] Add support for more zattoo platform sites --- youtube_dl/extractor/extractors.py | 12 ++ youtube_dl/extractor/zattoo.py | 178 ++++++++++++++++++++++++++--- 2 files changed, 172 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7dc569724..464c8d690 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1455,8 +1455,20 @@ from .youtube import ( from .zapiks import ZapiksIE from .zaq1 import Zaq1IE from .zattoo import ( + BBVTVIE, + EinsUndEinsTVIE, + EWETVIE, + GlattvisionTVIE, + MNetTVIE, + MyVisionTVIE, + NetPlusIE, + OsnatelTVIE, + QuantumTVIE, QuicklineIE, QuicklineLiveIE, + SAKTVIE, + VTXTVIE, + WalyTVIE, ZattooIE, ZattooLiveIE, ) diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index 9c9024799..bbe0aecb6 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -18,12 +18,12 @@ from ..utils import ( ) -class ZattooBaseIE(InfoExtractor): - _NETRC_MACHINE = 'zattoo' - _HOST_URL = 'https://zattoo.com' - +class ZattooPlatformBaseIE(InfoExtractor): _power_guide_hash = None + def _host_url(self): + return 'https://%s' % self._HOST + def _login(self): username, password = self._get_login_info() if not username or not password: @@ -33,13 +33,13 @@ class ZattooBaseIE(InfoExtractor): try: data = self._download_json( - '%s/zapi/v2/account/login' % self._HOST_URL, None, 'Logging in', + '%s/zapi/v2/account/login' % self._host_url(), None, 'Logging in', data=urlencode_postdata({ 'login': username, 'password': password, 'remember': 'true', }), headers={ - 'Referer': '%s/login' % self._HOST_URL, + 'Referer': '%s/login' % self._host_url(), 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', }) except ExtractorError as e: @@ -53,7 +53,7 @@ class ZattooBaseIE(InfoExtractor): def _real_initialize(self): webpage = self._download_webpage( - self._HOST_URL, None, 'Downloading app token') + self._host_url(), None, 'Downloading app token') app_token = self._html_search_regex( r'appToken\s*=\s*(["\'])(?P<token>(?:(?!\1).)+?)\1', webpage, 'app token', group='token') @@ -62,7 +62,7 @@ class ZattooBaseIE(InfoExtractor): # Will setup appropriate cookies self._request_webpage( - '%s/zapi/v2/session/hello' % self._HOST_URL, None, + '%s/zapi/v2/session/hello' % self._host_url(), None, 'Opening session', data=urlencode_postdata({ 'client_app_token': app_token, 'uuid': compat_str(uuid4()), @@ -75,7 +75,7 @@ class ZattooBaseIE(InfoExtractor): def _extract_cid(self, video_id, channel_name): channel_groups = self._download_json( - '%s/zapi/v2/cached/channels/%s' % (self._HOST_URL, + '%s/zapi/v2/cached/channels/%s' % (self._host_url(), self._power_guide_hash), video_id, 'Downloading channel list', query={'details': False})['channel_groups'] @@ -94,7 +94,7 @@ class ZattooBaseIE(InfoExtractor): def _extract_cid_and_video_info(self, video_id): data = self._download_json( '%s/zapi/v2/cached/program/power_details/%s' % ( - self._HOST_URL, self._power_guide_hash), + self._host_url(), self._power_guide_hash), video_id, 'Downloading video information', query={ @@ -128,11 +128,11 @@ class ZattooBaseIE(InfoExtractor): if is_live: postdata_common.update({'timeshift': 10800}) - url = '%s/zapi/watch/live/%s' % (self._HOST_URL, cid) + url = '%s/zapi/watch/live/%s' % (self._host_url(), cid) elif record_id: - url = '%s/zapi/watch/recording/%s' % (self._HOST_URL, record_id) + url = '%s/zapi/watch/recording/%s' % (self._host_url(), record_id) else: - url = '%s/zapi/watch/recall/%s/%s' % (self._HOST_URL, cid, video_id) + url = '%s/zapi/watch/recall/%s/%s' % (self._host_url(), cid, video_id) formats = [] for stream_type in ('dash', 'hls', 'hls5', 'hds'): @@ -203,13 +203,13 @@ class ZattooBaseIE(InfoExtractor): return info_dict -class QuicklineBaseIE(ZattooBaseIE): +class QuicklineBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'quickline' - _HOST_URL = 'https://mobiltv.quickline.com' + _HOST = 'mobiltv.quickline.com' class QuicklineIE(QuicklineBaseIE): - _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' % re.escape(QuicklineBaseIE._HOST) _TEST = { 'url': 'https://mobiltv.quickline.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', @@ -222,7 +222,7 @@ class QuicklineIE(QuicklineBaseIE): class QuicklineLiveIE(QuicklineBaseIE): - _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<id>[^/]+)' % re.escape(QuicklineBaseIE._HOST) _TEST = { 'url': 'https://mobiltv.quickline.com/watch/srf1', @@ -238,8 +238,18 @@ class QuicklineLiveIE(QuicklineBaseIE): return self._extract_video(channel_name, video_id, is_live=True) +class ZattooBaseIE(ZattooPlatformBaseIE): + _NETRC_MACHINE = 'zattoo' + _HOST = 'zattoo.com' + + +def _make_valid_url(tmpl, host): + return tmpl % re.escape(host) + + class ZattooIE(ZattooBaseIE): - _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' + _VALID_URL_TEMPLATE = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' + _VALID_URL = _make_valid_url(_VALID_URL_TEMPLATE, ZattooBaseIE._HOST) # Since regular videos are only available for 7 days and recorded videos # are only available for a specific user, we cannot have detailed tests. @@ -271,3 +281,135 @@ class ZattooLiveIE(ZattooBaseIE): def _real_extract(self, url): channel_name = video_id = self._match_id(url) return self._extract_video(channel_name, video_id, is_live=True) + + +class NetPlusIE(ZattooIE): + _NETRC_MACHINE = 'netplus' + _HOST = 'netplus.tv' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.netplus.tv/watch/abc/123-abc', + 'only_matching': True, + }] + + +class MNetTVIE(ZattooIE): + _NETRC_MACHINE = 'mnettv' + _HOST = 'tvplus.m-net.de' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.tvplus.m-net.de/watch/abc/123-abc', + 'only_matching': True, + }] + + +class WalyTVIE(ZattooIE): + _NETRC_MACHINE = 'walytv' + _HOST = 'player.waly.tv' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.player.waly.tv/watch/abc/123-abc', + 'only_matching': True, + }] + + +class BBVTVIE(ZattooIE): + _NETRC_MACHINE = 'bbvtv' + _HOST = 'bbv-tv.net' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.bbv-tv.net/watch/abc/123-abc', + 'only_matching': True, + }] + + +class VTXTVIE(ZattooIE): + _NETRC_MACHINE = 'vtxtv' + _HOST = 'vtxtv.ch' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.vtxtv.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class MyVisionTVIE(ZattooIE): + _NETRC_MACHINE = 'myvisiontv' + _HOST = 'myvisiontv.ch' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.myvisiontv.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class GlattvisionTVIE(ZattooIE): + _NETRC_MACHINE = 'glattvisiontv' + _HOST = 'iptv.glattvision.ch' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.iptv.glattvision.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class SAKTVIE(ZattooIE): + _NETRC_MACHINE = 'saktv' + _HOST = 'saktv.ch' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.saktv.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class EWETVIE(ZattooIE): + _NETRC_MACHINE = 'ewetv' + _HOST = 'tvonline.ewe.de' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.tvonline.ewe.de/watch/abc/123-abc', + 'only_matching': True, + }] + + +class QuantumTVIE(ZattooIE): + _NETRC_MACHINE = 'quantumtv' + _HOST = 'quantum-tv.com' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.quantum-tv.com/watch/abc/123-abc', + 'only_matching': True, + }] + + +class OsnatelTVIE(ZattooIE): + _NETRC_MACHINE = 'osnateltv' + _HOST = 'onlinetv.osnatel.de' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.onlinetv.osnatel.de/watch/abc/123-abc', + 'only_matching': True, + }] + + +class EinsUndEinsTVIE(ZattooIE): + _NETRC_MACHINE = '1und1tv' + _HOST = '1und1.tv' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.1und1.tv/watch/abc/123-abc', + 'only_matching': True, + }] From cd5a74a28e7102e877d00b1e3cffba82d28b2f2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 24 Sep 2018 00:14:49 +0700 Subject: [PATCH 467/488] [youtube] Add support for invidio.us (closes #17613) --- youtube_dl/extractor/youtube.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e80e36f98..78203ef84 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -349,6 +349,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| + (?:www\.)?invidio\.us/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -1068,6 +1069,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', 'only_matching': True, }, + { + 'url': 'https://invidio.us/watch?v=BaW_jenozKc', + 'only_matching': True, + }, ] def __init__(self, *args, **kwargs): @@ -2419,7 +2424,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' + _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' @@ -2440,6 +2445,9 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'id': 'UUs0ifCMCm1icqRbqhUINa0w', 'title': 'Uploads from Deus Ex', }, + }, { + 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', + 'only_matching': True, }] @classmethod From 60ce0c67fd1ef71463af2c036bbabf06ec26bd98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 25 Sep 2018 23:43:41 +0700 Subject: [PATCH 468/488] [README.md] Document channel meta fields for output template --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index dd068a462..fdd115c9b 100644 --- a/README.md +++ b/README.md @@ -511,6 +511,8 @@ The basic usage is not to set any template arguments when downloading a single f - `timestamp` (numeric): UNIX timestamp of the moment the video became available - `upload_date` (string): Video upload date (YYYYMMDD) - `uploader_id` (string): Nickname or id of the video uploader + - `channel` (string): Full name of the channel the video is uploaded on + - `channel_id` (string): Id of the channel - `location` (string): Physical location where the video was filmed - `duration` (numeric): Length of the video in seconds - `view_count` (numeric): How many users have watched the video on the platform From 8fd12a083131550476fb771c180a0734794d0b9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 26 Sep 2018 05:38:41 +0700 Subject: [PATCH 469/488] [mediaset] Improve embed support (closes #17668) --- youtube_dl/extractor/generic.py | 2 +- youtube_dl/extractor/mediaset.py | 38 +++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 76ef01332..2a48667f0 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -3023,7 +3023,7 @@ class GenericIE(InfoExtractor): wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) # Look for Mediaset embeds - mediaset_urls = MediasetIE._extract_urls(webpage) + mediaset_urls = MediasetIE._extract_urls(self, webpage) if mediaset_urls: return self.playlist_from_matches( mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 57f97409d..df3748798 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -4,6 +4,11 @@ from __future__ import unicode_literals import re from .theplatform import ThePlatformBaseIE +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -76,12 +81,33 @@ class MediasetIE(ThePlatformBaseIE): }] @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1', - webpage)] + def _extract_urls(ie, webpage): + def _qs(url): + return compat_parse_qs(compat_urllib_parse_urlparse(url).query) + + def _program_guid(qs): + return qs.get('programGuid', [None])[0] + + entries = [] + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml.*?)\1', + webpage): + embed_url = mobj.group('url') + embed_qs = _qs(embed_url) + program_guid = _program_guid(embed_qs) + if program_guid: + entries.append(embed_url) + continue + video_id = embed_qs.get('id', [None])[0] + if not video_id: + continue + urlh = ie._request_webpage( + embed_url, video_id, note='Following embed URL redirect') + embed_url = compat_str(urlh.geturl()) + program_guid = _program_guid(_qs(embed_url)) + if program_guid: + entries.append(embed_url) + return entries def _real_extract(self, url): guid = self._match_id(url) From c17e100b96e3a1c1d2115226c9a2f166c8338031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 26 Sep 2018 09:27:40 +0700 Subject: [PATCH 470/488] [pluralsight] Fix subtitles extraction (closes #17671) --- youtube_dl/extractor/pluralsight.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 1257841e4..ec67381bb 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -213,7 +213,7 @@ query viewClip { def _get_subtitles(self, author, clip_idx, lang, name, duration, video_id): captions_post = { 'a': author, - 'cn': clip_idx, + 'cn': int(clip_idx), 'lc': lang, 'm': name, } From 3d3499742c88ce89de5b49dff4e96483b089fcaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 26 Sep 2018 11:56:15 +0700 Subject: [PATCH 471/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ChangeLog b/ChangeLog index 800ece790..15b875297 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +version <unreleased> + +Extractors +* [pluralsight] Fix subtitles extraction (#17671) +* [mediaset] Improve embed support (#17668) ++ [youtube] Add support for invidio.us (#17613) ++ [zattoo] Add support for more zattoo platform sites +* [zattoo] Fix extraction (#17175, #17542) + + version 2018.09.18 Core From 4c89a675dd50429a1a5f5903e624db0204d2435a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 26 Sep 2018 11:58:25 +0700 Subject: [PATCH 472/488] release 2018.09.26 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 12 ++++++++++++ youtube_dl/version.py | 2 +- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index a4602287a..ed3e0a157 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.09.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.09.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.09.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.09.26** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.09.18 +[debug] youtube-dl version 2018.09.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 15b875297..241712037 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.09.26 Extractors * [pluralsight] Fix subtitles extraction (#17671) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9b8601751..736ab6da7 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -98,6 +98,7 @@ - **bbc.co.uk:article**: BBC articles - **bbc.co.uk:iplayer:playlist** - **bbc.co.uk:playlist** + - **BBVTV** - **Beatport** - **Beeg** - **BehindKink** @@ -251,6 +252,7 @@ - **egghead:course**: egghead.io course - **egghead:lesson**: egghead.io lesson - **eHow** + - **EinsUndEinsTV** - **Einthusan** - **eitb.tv** - **EllenTube** @@ -268,6 +270,7 @@ - **EsriVideo** - **Europa** - **EveryonesMixtape** + - **EWETV** - **ExpoTV** - **Expressen** - **ExtremeTube** @@ -327,6 +330,7 @@ - **Gfycat** - **GiantBomb** - **Giga** + - **GlattvisionTV** - **Glide**: Glide mobile video messages (glide.me) - **Globo** - **GloboArticle** @@ -494,6 +498,7 @@ - **Mixer:vod** - **MLB** - **Mnet** + - **MNetTV** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** - **Mojvideo** @@ -525,6 +530,7 @@ - **Myvi** - **MyVidster** - **MyviEmbed** + - **MyVisionTV** - **n-tv.de** - **natgeo** - **natgeo:episodeguide** @@ -550,6 +556,7 @@ - **netease:program**: 网易云音乐 - 电台节目 - **netease:singer**: 网易云音乐 - 歌手 - **netease:song**: 网易云音乐 + - **NetPlus** - **Netzkino** - **Newgrounds** - **NewgroundsPlaylist** @@ -626,6 +633,7 @@ - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek + - **OsnatelTV** - **PacktPub** - **PacktPubCourse** - **PandaTV**: 熊猫TV @@ -686,6 +694,7 @@ - **qqmusic:playlist**: QQ音乐 - 歌单 - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 + - **QuantumTV** - **Quickline** - **QuicklineLive** - **R7** @@ -753,6 +762,7 @@ - **safari**: safaribooksonline.com online video - **safari:api** - **safari:course**: safaribooksonline.com online courses + - **SAKTV** - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au @@ -1035,12 +1045,14 @@ - **vrv** - **vrv:series** - **VShare** + - **VTXTV** - **vube**: Vube.com - **VuClip** - **VVVVID** - **VyboryMos** - **Vzaar** - **Walla** + - **WalyTV** - **washingtonpost** - **washingtonpost:article** - **wat.tv** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2b3b584a4..6f2cc31df 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.09.18' +__version__ = '2018.09.26' From 85cd69adcb41454f30d4db85eeb4d5cc26b6bb5d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 26 Sep 2018 08:13:16 +0100 Subject: [PATCH 473/488] [hotstar] fix extraction(closes #14694)(closes #14931)(closes #17637) --- youtube_dl/extractor/hotstar.py | 159 +++++++++++++++----------------- 1 file changed, 73 insertions(+), 86 deletions(-) diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index d28af36ec..354ac00dc 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -1,10 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import re +import hashlib +import hmac +import time from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -13,37 +15,40 @@ from ..utils import ( class HotStarBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['IN'] + _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' - def _download_json(self, *args, **kwargs): - response = super(HotStarBaseIE, self)._download_json(*args, **kwargs) - if response['resultCode'] != 'OK': - if kwargs.get('fatal'): - raise ExtractorError( - response['errorDescription'], expected=True) - return None - return response['resultObj'] - - def _download_content_info(self, content_id): - return self._download_json( - 'https://account.hotstar.com/AVS/besc', content_id, query={ - 'action': 'GetAggregatedContentDetails', - 'appVersion': '5.0.40', - 'channel': 'PCTV', - 'contentId': content_id, - })['contentInfo'][0] + def _call_api(self, path, video_id, query_name='contentId'): + st = int(time.time()) + exp = st + 6000 + auth = 'st=%d~exp=%d~acl=/*' % (st, exp) + auth += '~hmac=' + hmac.new(self._AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest() + response = self._download_json( + 'https://api.hotstar.com/' + path, + video_id, headers={ + 'hotstarauth': auth, + 'x-country-code': 'IN', + 'x-platform-code': 'JIO', + }, query={ + query_name: video_id, + 'tas': 10000, + }) + if response['statusCode'] != 'OK': + raise ExtractorError( + response['body']['message'], expected=True) + return response['body']['results'] class HotStarIE(HotStarBaseIE): + IE_NAME = 'hotstar' _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})' _TESTS = [{ - 'url': 'http://www.hotstar.com/on-air-with-aib--english-1000076273', + 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', 'info_dict': { 'id': '1000076273', 'ext': 'mp4', - 'title': 'On Air With AIB', + 'title': 'Can You Not Spread Rumours?', 'description': 'md5:c957d8868e9bc793ccb813691cc4c434', - 'timestamp': 1447227000, + 'timestamp': 1447248600, 'upload_date': '20151111', 'duration': 381, }, @@ -58,47 +63,43 @@ class HotStarIE(HotStarBaseIE): 'url': 'http://www.hotstar.com/1000000515', 'only_matching': True, }] + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_content_info(video_id) + webpage = self._download_webpage(url, video_id) + app_state = self._parse_json(self._search_regex( + r'<script>window\.APP_STATE\s*=\s*({.+?})</script>', + webpage, 'app state'), video_id) + video_data = list(app_state.values())[0]['initialState']['contentData']['content'] - title = video_data['episodeTitle'] + title = video_data['title'] - if video_data.get('encrypted') == 'Y': + if video_data.get('drmProtected'): raise ExtractorError('This video is DRM protected.', expected=True) formats = [] - for f in ('JIO',): - format_data = self._download_json( - 'http://getcdn.hotstar.com/AVS/besc', - video_id, 'Downloading %s JSON metadata' % f, - fatal=False, query={ - 'action': 'GetCDN', - 'asJson': 'Y', - 'channel': f, - 'id': video_id, - 'type': 'VOD', - }) - if format_data: - format_url = format_data.get('src') - if not format_url: - continue - ext = determine_ext(format_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - # produce broken files - continue - else: - formats.append({ - 'url': format_url, - 'width': int_or_none(format_data.get('width')), - 'height': int_or_none(format_data.get('height')), - }) + format_data = self._call_api('h/v1/play', video_id)['item'] + format_url = format_data['playbackUrl'] + ext = determine_ext(format_url) + if ext == 'm3u8': + try: + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls')) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_geo_restricted(countries=['IN']) + raise + elif ext == 'f4m': + # produce broken files + pass + else: + formats.append({ + 'url': format_url, + 'width': int_or_none(format_data.get('width')), + 'height': int_or_none(format_data.get('height')), + }) self._sort_formats(formats) return { @@ -106,57 +107,43 @@ class HotStarIE(HotStarBaseIE): 'title': title, 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), - 'timestamp': int_or_none(video_data.get('broadcastDate')), + 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), 'formats': formats, + 'channel': video_data.get('channelName'), + 'channel_id': video_data.get('channelId'), + 'series': video_data.get('showName'), + 'season': video_data.get('seasonName'), + 'season_number': int_or_none(video_data.get('seasonNo')), + 'season_id': video_data.get('seasonId'), 'episode': title, - 'episode_number': int_or_none(video_data.get('episodeNumber')), - 'series': video_data.get('contentTitle'), + 'episode_number': int_or_none(video_data.get('episodeNo')), } class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com/tv/[^/]+/(?P<content_id>\d+))/(?P<type>[^/]+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com/tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)' _TESTS = [{ - 'url': 'http://www.hotstar.com/tv/pratidaan/14982/episodes/14812/9993', + 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { - 'id': '14812', + 'id': '3_2_26', }, - 'playlist_mincount': 75, + 'playlist_mincount': 20, }, { - 'url': 'http://www.hotstar.com/tv/pratidaan/14982/popular-clips/9998/9998', + 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, }] - _ITEM_TYPES = { - 'episodes': 'EPISODE', - 'popular-clips': 'CLIPS', - } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - base_url = mobj.group('url') - content_id = mobj.group('content_id') - playlist_type = mobj.group('type') + playlist_id = self._match_id(url) - content_info = self._download_content_info(content_id) - playlist_id = compat_str(content_info['categoryId']) - - collection = self._download_json( - 'https://search.hotstar.com/AVS/besc', playlist_id, query={ - 'action': 'SearchContents', - 'appVersion': '5.0.40', - 'channel': 'PCTV', - 'moreFilters': 'series:%s;' % playlist_id, - 'query': '*', - 'searchOrder': 'last_broadcast_date desc,year desc,title asc', - 'type': self._ITEM_TYPES.get(playlist_type, 'EPISODE'), - }) + collection = self._call_api('o/v1/tray/find', playlist_id, 'uqId') entries = [ self.url_result( - '%s/_/%s' % (base_url, video['contentId']), + 'https://www.hotstar.com/%s' % video['contentId'], ie=HotStarIE.ie_key(), video_id=video['contentId']) - for video in collection['response']['docs'] + for video in collection['assets']['items'] if video.get('contentId')] return self.playlist_result(entries, playlist_id) From 245cbb33bcb7d519c897277e65acdcbc45335233 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 28 Sep 2018 15:13:25 +0100 Subject: [PATCH 474/488] [spike] fix Paramount Network extraction(closes #17677) --- youtube_dl/extractor/spike.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index e76522b45..6090e0066 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -44,3 +44,10 @@ class ParamountNetworkIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] + + def _extract_mgid(self, webpage): + cs = self._parse_json(self._search_regex( + r'window\.__DATA__\s*=\s*({.+})', + webpage, 'data'), None)['children'] + c = next(c for c in cs if c.get('type') == 'VideoPlayer') + return c['props']['media']['video']['config']['uri'] From 85fa80d5f9d867bab706b92d15a6c9ad5b862b47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 1 Oct 2018 21:13:43 +0700 Subject: [PATCH 475/488] [vimeo] Add another config regex (closes #17690) --- youtube_dl/extractor/vimeo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 0a9239b62..88f4d9979 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -551,6 +551,7 @@ class VimeoIE(VimeoBaseInfoExtractor): else: config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') + config_re.append(r'\bconfig\s*=\s*({.+?})\s*;') config = self._search_regex(config_re, webpage, 'info section', flags=re.DOTALL) config = json.loads(config) From 365343131d752bece96d2279a3e0bcd7e9f0000f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 1 Oct 2018 21:45:24 +0700 Subject: [PATCH 476/488] [pluralsight] Fix subtitles extraction (closes #17726, closes #17728) --- youtube_dl/extractor/pluralsight.py | 34 ++++++++++++++++++----------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index ec67381bb..daf172570 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -210,18 +210,26 @@ query viewClip { raise ExtractorError('Unable to log in') - def _get_subtitles(self, author, clip_idx, lang, name, duration, video_id): - captions_post = { - 'a': author, - 'cn': int(clip_idx), - 'lc': lang, - 'm': name, - } - captions = self._download_json( - '%s/player/retrieve-captions' % self._API_BASE, video_id, - 'Downloading captions JSON', 'Unable to download captions JSON', - fatal=False, data=json.dumps(captions_post).encode('utf-8'), - headers={'Content-Type': 'application/json;charset=utf-8'}) + def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id): + captions = None + if clip_id: + captions = self._download_json( + '%s/transcript/api/v1/caption/json/%s/%s' + % (self._API_BASE, clip_id, lang), video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False) + if not captions: + captions_post = { + 'a': author, + 'cn': int(clip_idx), + 'lc': lang, + 'm': name, + } + captions = self._download_json( + '%s/player/retrieve-captions' % self._API_BASE, video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False, data=json.dumps(captions_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) if captions: return { lang: [{ @@ -413,7 +421,7 @@ query viewClip { # TODO: other languages? subtitles = self.extract_subtitles( - author, clip_idx, 'en', name, duration, display_id) + author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id) return { 'id': clip_id, From 9795d93316cb63e0e2a73ee840600be7f13e19f5 Mon Sep 17 00:00:00 2001 From: Enes <enessolak99@gmail.com> Date: Mon, 1 Oct 2018 17:48:59 +0300 Subject: [PATCH 477/488] [openload] Add support for oload.cloud (closes #17710) --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d264fe206..dc01b6346 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -307,6 +307,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.download/f/kUEfGclsU9o', 'only_matching': True, + }, { + 'url': 'https://oload.cloud/f/4ZDnBXRWiB8', + 'only_matching': True, }, { # Its title has not got its extension but url has it 'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', From 3c7da54c92b7560c3ceeb8b58a67d4759ed843d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 1 Oct 2018 22:05:18 +0700 Subject: [PATCH 478/488] [jamendo] Add support for licensing.jamendo.com (closes #17724) --- youtube_dl/extractor/jamendo.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index 595d7a5b7..c21827618 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -26,8 +26,15 @@ class JamendoBaseIE(InfoExtractor): class JamendoIE(JamendoBaseIE): - _VALID_URL = r'https?://(?:www\.)?jamendo\.com/track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+)' - _TEST = { + _VALID_URL = r'''(?x) + https?:// + (?: + licensing\.jamendo\.com/[^/]+| + (?:www\.)?jamendo\.com + ) + /track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+) + ''' + _TESTS = [{ 'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i', 'md5': '6e9e82ed6db98678f171c25a8ed09ffd', 'info_dict': { @@ -40,14 +47,19 @@ class JamendoIE(JamendoBaseIE): 'duration': 210, 'thumbnail': r're:^https?://.*\.jpg' } - } + }, { + 'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock', + 'only_matching': True, + }] def _real_extract(self, url): mobj = self._VALID_URL_RE.match(url) track_id = mobj.group('id') display_id = mobj.group('display_id') - webpage = self._download_webpage(url, display_id) + webpage = self._download_webpage( + 'https://www.jamendo.com/track/%s/%s' % (track_id, display_id), + display_id) title, artist, track = self._extract_meta(webpage) From 66d106f2705d565675b6a94fb0e7e0bdfe132065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 1 Oct 2018 23:29:24 +0700 Subject: [PATCH 479/488] [philharmoniedeparis] Fix extraction and add support for pad.philharmoniedeparis.fr (closes #17705) --- youtube_dl/extractor/philharmoniedeparis.py | 118 ++++++++++++-------- 1 file changed, 70 insertions(+), 48 deletions(-) diff --git a/youtube_dl/extractor/philharmoniedeparis.py b/youtube_dl/extractor/philharmoniedeparis.py index f1008ae51..f723a2b3b 100644 --- a/youtube_dl/extractor/philharmoniedeparis.py +++ b/youtube_dl/extractor/philharmoniedeparis.py @@ -2,31 +2,38 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - float_or_none, - int_or_none, - parse_iso8601, - xpath_text, + try_get, + urljoin, ) class PhilharmonieDeParisIE(InfoExtractor): IE_DESC = 'Philharmonie de Paris' - _VALID_URL = r'https?://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?: + live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)| + pad\.philharmoniedeparis\.fr/doc/CIMU/ + ) + (?P<id>\d+) + ''' _TESTS = [{ + 'url': 'http://pad.philharmoniedeparis.fr/doc/CIMU/1086697/jazz-a-la-villette-knower', + 'md5': 'a0a4b195f544645073631cbec166a2c2', + 'info_dict': { + 'id': '1086697', + 'ext': 'mp4', + 'title': 'Jazz à la Villette : Knower', + }, + }, { 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html', 'info_dict': { 'id': '1032066', - 'ext': 'flv', - 'title': 'md5:d1f5585d87d041d07ce9434804bc8425', - 'timestamp': 1428179400, - 'upload_date': '20150404', - 'duration': 6592.278, + 'title': 'md5:0a031b81807b3593cffa3c9a87a167a0', }, - 'params': { - # rtmp download - 'skip_download': True, - } + 'playlist_mincount': 2, }, { 'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html', 'only_matching': True, @@ -34,45 +41,60 @@ class PhilharmonieDeParisIE(InfoExtractor): 'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr', 'only_matching': True, }] + _LIVE_URL = 'https://live.philharmoniedeparis.fr' def _real_extract(self, url): video_id = self._match_id(url) - concert = self._download_xml( - 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=%s' % video_id, - video_id).find('./concert') + config = self._download_json( + '%s/otoPlayer/config.ashx' % self._LIVE_URL, video_id, query={ + 'id': video_id, + 'lang': 'fr-FR', + }) - formats = [] - info_dict = { - 'id': video_id, - 'title': xpath_text(concert, './titre', 'title', fatal=True), - 'formats': formats, - } - - fichiers = concert.find('./fichiers') - stream = fichiers.attrib['serveurstream'] - for fichier in fichiers.findall('./fichier'): - info_dict['duration'] = float_or_none(fichier.get('timecodefin')) - for quality, (format_id, suffix) in enumerate([('lq', ''), ('hq', '_hd')]): - format_url = fichier.get('url%s' % suffix) - if not format_url: + def extract_entry(source): + if not isinstance(source, dict): + return + title = source.get('title') + if not title: + return + files = source.get('files') + if not isinstance(files, dict): + return + format_urls = set() + formats = [] + for format_id in ('mobile', 'desktop'): + format_url = try_get( + files, lambda x: x[format_id]['file'], compat_str) + if not format_url or format_url in format_urls: continue - formats.append({ - 'url': stream, - 'play_path': format_url, - 'ext': 'flv', - 'format_id': format_id, - 'width': int_or_none(concert.get('largeur%s' % suffix)), - 'height': int_or_none(concert.get('hauteur%s' % suffix)), - 'quality': quality, - }) - self._sort_formats(formats) + format_urls.add(format_url) + m3u8_url = urljoin(self._LIVE_URL, format_url) + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + if not formats: + return + self._sort_formats(formats) + return { + 'title': title, + 'formats': formats, + } - date, hour = concert.get('date'), concert.get('heure') - if date and hour: - info_dict['timestamp'] = parse_iso8601( - '%s-%s-%sT%s:00' % (date[0:4], date[4:6], date[6:8], hour)) - elif date: - info_dict['upload_date'] = date + thumbnail = urljoin(self._LIVE_URL, config.get('image')) - return info_dict + info = extract_entry(config) + if info: + info.update({ + 'id': video_id, + 'thumbnail': thumbnail, + }) + return info + + entries = [] + for num, chapter in enumerate(config['chapters'], start=1): + entry = extract_entry(chapter) + entry['id'] = '%s-%d' % (video_id, num) + entries.append(entry) + + return self.playlist_result(entries, video_id, config.get('title')) From 05e7c184da85f83b254bc3d138f89b11da802bdb Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 2 Oct 2018 06:07:06 +0100 Subject: [PATCH 480/488] [hotstar] fix extraction in python 2(closes #17696) --- youtube_dl/extractor/hotstar.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index 354ac00dc..bf5717f1b 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -11,6 +11,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + try_get, ) @@ -72,7 +73,11 @@ class HotStarIE(HotStarBaseIE): app_state = self._parse_json(self._search_regex( r'<script>window\.APP_STATE\s*=\s*({.+?})</script>', webpage, 'app state'), video_id) - video_data = list(app_state.values())[0]['initialState']['contentData']['content'] + video_data = {} + for v in app_state.values(): + content = try_get(v, lambda x: x['initialState']['contentData']['content'], dict) + if content and content.get('contentId') == video_id: + video_data = content title = video_data['title'] From d98cb62e552ee74079fda2e4173a40b14faac3fe Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 2 Oct 2018 19:43:06 +0100 Subject: [PATCH 481/488] [crunchyroll] switch to HTTPS for RpcApi(closes #17749) --- youtube_dl/extractor/crunchyroll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index af786d096..045be0ab5 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -45,7 +45,7 @@ class CrunchyrollBaseIE(InfoExtractor): data['req'] = 'RpcApi' + method data = compat_urllib_parse_urlencode(data).encode('utf-8') return self._download_xml( - 'http://www.crunchyroll.com/xml/', + 'https://www.crunchyroll.com/xml/', video_id, note, fatal=False, data=data, headers={ 'Content-Type': 'application/x-www-form-urlencoded', }) From f60b9803a473da8e324313d01af91e5676792c77 Mon Sep 17 00:00:00 2001 From: Enes <enessolak99@gmail.com> Date: Sat, 29 Sep 2018 13:28:56 +0300 Subject: [PATCH 482/488] [dailymotion] Fix extraction (closes #17699) --- youtube_dl/extractor/dailymotion.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 040f0bd02..842d9a259 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -24,6 +24,7 @@ from ..utils import ( str_to_int, unescapeHTML, urlencode_postdata, + try_get, ) @@ -172,7 +173,12 @@ class DailymotionIE(DailymotionBaseInfoExtractor): webpage, 'player v5', default=None) if player_v5: player = self._parse_json(player_v5, video_id) - metadata = player['metadata'] + metadata = try_get( + player, lambda x: x['metadata'], dict) or self._download_json( + 'http://www.dailymotion.com/player/metadata/video/%s' % video_id, video_id, query={ + 'integration': 'inline', + 'GK_PV5_NEON': '1', + }) if metadata.get('error', {}).get('type') == 'password_protected': password = self._downloader.params.get('videopassword') From 0082f44a08e33712fcd33ceabab15215c962eaac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 5 Oct 2018 02:02:58 +0700 Subject: [PATCH 483/488] [dailymotion] Improve metadata extraction (closes #17706) --- youtube_dl/extractor/dailymotion.py | 32 ++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 842d9a259..1816c559e 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -22,9 +22,11 @@ from ..utils import ( parse_iso8601, sanitized_Request, str_to_int, - unescapeHTML, - urlencode_postdata, try_get, + unescapeHTML, + update_url_query, + url_or_none, + urlencode_postdata, ) @@ -172,15 +174,25 @@ class DailymotionIE(DailymotionBaseInfoExtractor): r'__PLAYER_CONFIG__\s*=\s*({.+?});'], webpage, 'player v5', default=None) if player_v5: - player = self._parse_json(player_v5, video_id) - metadata = try_get( - player, lambda x: x['metadata'], dict) or self._download_json( - 'http://www.dailymotion.com/player/metadata/video/%s' % video_id, video_id, query={ - 'integration': 'inline', - 'GK_PV5_NEON': '1', - }) + player = self._parse_json(player_v5, video_id, fatal=False) or {} + metadata = try_get(player, lambda x: x['metadata'], dict) + if not metadata: + metadata_url = url_or_none(try_get( + player, lambda x: x['context']['metadata_template_url1'])) + if metadata_url: + metadata_url = metadata_url.replace(':videoId', video_id) + else: + metadata_url = update_url_query( + 'https://www.dailymotion.com/player/metadata/video/%s' + % video_id, { + 'embedder': url, + 'integration': 'inline', + 'GK_PV5_NEON': '1', + }) + metadata = self._download_json( + metadata_url, video_id, 'Downloading metadata JSON') - if metadata.get('error', {}).get('type') == 'password_protected': + if try_get(metadata, lambda x: x['error']['type']) == 'password_protected': password = self._downloader.params.get('videopassword') if password: r = int(metadata['id'][1:], 36) From 21c1a00dd7dbb9f7551ca9809a194f6380dee7a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 5 Oct 2018 02:27:14 +0700 Subject: [PATCH 484/488] [pluralsight] Improve authentication (closes #17762) --- youtube_dl/extractor/pluralsight.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index daf172570..eafe56897 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -4,6 +4,7 @@ import collections import json import os import random +import re from .common import InfoExtractor from ..compat import ( @@ -196,7 +197,10 @@ query viewClip { if error: raise ExtractorError('Unable to login: %s' % error, expected=True) - if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')): + if all(not re.search(p, response) for p in ( + r'__INITIAL_STATE__', r'["\']currentUser["\']', + # new layout? + r'>\s*Sign out\s*<')): BLOCKED = 'Your account has been blocked due to suspicious activity' if BLOCKED in response: raise ExtractorError( From 2e7ed29e3429c20e735f3f7dfceb1b13cf757037 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 5 Oct 2018 02:29:52 +0700 Subject: [PATCH 485/488] [ChangeLog] Actualize [ci skip] --- ChangeLog | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ChangeLog b/ChangeLog index 241712037..e2757f891 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +version <unreleased> + +Extractors +* [pluralsight] Improve authentication (#17762) +* [dailymotion] Fix extraction (#17699) +* [crunchyroll] Switch to HTTPS for RpcApi (#17749) ++ [philharmoniedeparis] Add support for pad.philharmoniedeparis.fr (#17705) +* [philharmoniedeparis] Fix extraction (#17705) ++ [jamendo] Add support for licensing.jamendo.com (#17724) ++ [openload] Add support for oload.cloud (#17710) +* [pluralsight] Fix subtitles extraction (#17726, #17728) ++ [vimeo] Add another config regular expression (#17690) +* [spike] Fix Paramount Network extraction (#17677) +* [hotstar] Fix extraction (#14694, #14931, #17637) + + version 2018.09.26 Extractors From d96f976b0c36f65894380d3d831b0520d6260c20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 5 Oct 2018 02:31:30 +0700 Subject: [PATCH 486/488] release 2018.10.05 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index ed3e0a157..058eb4321 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.09.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.09.26** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.10.05*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.10.05** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.09.26 +[debug] youtube-dl version 2018.10.05 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index e2757f891..86cf489b1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.10.05 Extractors * [pluralsight] Improve authentication (#17762) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 736ab6da7..f167a6ddc 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -360,7 +360,7 @@ - **HitRecord** - **HornBunny** - **HotNewHipHop** - - **HotStar** + - **hotstar** - **hotstar:playlist** - **Howcast** - **HowStuffWorks** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6f2cc31df..7d3f25019 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.09.26' +__version__ = '2018.10.05' From c9d891f19a923f53132b49a1f5b97f344d92503c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 5 Oct 2018 20:11:01 +0100 Subject: [PATCH 487/488] [patreon] fix extraction(closes #14502)(closes #10471) --- youtube_dl/extractor/patreon.py | 160 ++++++++++++++++++-------------- 1 file changed, 88 insertions(+), 72 deletions(-) diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index 9eb027679..6f73ed68d 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -2,52 +2,63 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + parse_iso8601, +) class PatreonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(?P<id>[^&#]+)' - _TESTS = [ - { - 'url': 'http://www.patreon.com/creation?hid=743933', - 'md5': 'e25505eec1053a6e6813b8ed369875cc', - 'info_dict': { - 'id': '743933', - 'ext': 'mp3', - 'title': 'Episode 166: David Smalley of Dogma Debate', - 'uploader': 'Cognitive Dissonance Podcast', - 'thumbnail': 're:^https?://.*$', - }, + _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.patreon.com/creation?hid=743933', + 'md5': 'e25505eec1053a6e6813b8ed369875cc', + 'info_dict': { + 'id': '743933', + 'ext': 'mp3', + 'title': 'Episode 166: David Smalley of Dogma Debate', + 'description': 'md5:713b08b772cd6271b9f3906683cfacdf', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': 're:^https?://.*$', + 'timestamp': 1406473987, + 'upload_date': '20140727', }, - { - 'url': 'http://www.patreon.com/creation?hid=754133', - 'md5': '3eb09345bf44bf60451b8b0b81759d0a', - 'info_dict': { - 'id': '754133', - 'ext': 'mp3', - 'title': 'CD 167 Extra', - 'uploader': 'Cognitive Dissonance Podcast', - 'thumbnail': 're:^https?://.*$', - }, + }, { + 'url': 'http://www.patreon.com/creation?hid=754133', + 'md5': '3eb09345bf44bf60451b8b0b81759d0a', + 'info_dict': { + 'id': '754133', + 'ext': 'mp3', + 'title': 'CD 167 Extra', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': 're:^https?://.*$', }, - { - 'url': 'https://www.patreon.com/creation?hid=1682498', - 'info_dict': { - 'id': 'SU4fj_aEMVw', - 'ext': 'mp4', - 'title': 'I\'m on Patreon!', - 'uploader': 'TraciJHines', - 'thumbnail': 're:^https?://.*$', - 'upload_date': '20150211', - 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4', - 'uploader_id': 'TraciJHines', - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - } + 'skip': 'Patron-only content', + }, { + 'url': 'https://www.patreon.com/creation?hid=1682498', + 'info_dict': { + 'id': 'SU4fj_aEMVw', + 'ext': 'mp4', + 'title': 'I\'m on Patreon!', + 'uploader': 'TraciJHines', + 'thumbnail': 're:^https?://.*$', + 'upload_date': '20150211', + 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4', + 'uploader_id': 'TraciJHines', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, } - ] + }, { + 'url': 'https://www.patreon.com/posts/episode-166-of-743933', + 'only_matching': True, + }, { + 'url': 'https://www.patreon.com/posts/743933', + 'only_matching': True, + }] # Currently Patreon exposes download URL via hidden CSS, so login is not # needed. Keeping this commented for when this inevitably changes. @@ -78,38 +89,43 @@ class PatreonIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage).strip() - - attach_fn = self._html_search_regex( - r'<div class="attach"><a target="_blank" href="([^"]+)">', - webpage, 'attachment URL', default=None) - embed = self._html_search_regex( - r'<div[^>]+id="watchCreation"[^>]*>\s*<iframe[^>]+src="([^"]+)"', - webpage, 'embedded URL', default=None) - - if attach_fn is not None: - video_url = 'http://www.patreon.com' + attach_fn - thumbnail = self._og_search_thumbnail(webpage) - uploader = self._html_search_regex( - r'<strong>(.*?)</strong> is creating', webpage, 'uploader') - elif embed is not None: - return self.url_result(embed) - else: - playlist = self._parse_json(self._search_regex( - r'(?s)new\s+jPlayerPlaylist\(\s*\{\s*[^}]*},\s*(\[.*?,?\s*\])', - webpage, 'playlist JSON'), - video_id, transform_source=js_to_json) - data = playlist[0] - video_url = self._proto_relative_url(data['mp3']) - thumbnail = self._proto_relative_url(data.get('cover')) - uploader = data.get('artist') - - return { + post = self._download_json( + 'https://www.patreon.com/api/posts/' + video_id, video_id) + attributes = post['data']['attributes'] + title = attributes['title'].strip() + image = attributes.get('image') or {} + info = { 'id': video_id, - 'url': video_url, - 'ext': 'mp3', 'title': title, - 'uploader': uploader, - 'thumbnail': thumbnail, + 'description': clean_html(attributes.get('content')), + 'thumbnail': image.get('large_url') or image.get('url'), + 'timestamp': parse_iso8601(attributes.get('published_at')), + 'like_count': int_or_none(attributes.get('like_count')), + 'comment_count': int_or_none(attributes.get('comment_count')), } + + for i in post.get('included', []): + i_type = i.get('type') + if i_type == 'attachment': + attachment_attributes = i.get('attributes') or {} + attachment_url = attachment_attributes.get('url') + if attachment_url: + info.update({ + 'url': attachment_url, + 'ext': determine_ext(attachment_attributes.get('name'), 'mp3'), + }) + elif i_type == 'user': + user_attributes = i.get('attributes') + if user_attributes: + info.update({ + 'uploader': user_attributes.get('full_name'), + 'uploader_url': user_attributes.get('url'), + }) + + if not info.get('url'): + info.update({ + '_type': 'url', + 'url': attributes['embed']['url'], + }) + + return info From 19a352854f5143b7cd120e990433d0fd40f617b0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 5 Oct 2018 22:45:04 +0100 Subject: [PATCH 488/488] [patreon] extract post_file url(#17792) --- youtube_dl/extractor/patreon.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index 6f73ed68d..426dd8121 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -104,16 +104,18 @@ class PatreonIE(InfoExtractor): 'comment_count': int_or_none(attributes.get('comment_count')), } + def add_file(file_data): + file_url = file_data.get('url') + if file_url: + info.update({ + 'url': file_url, + 'ext': determine_ext(file_data.get('name'), 'mp3'), + }) + for i in post.get('included', []): i_type = i.get('type') if i_type == 'attachment': - attachment_attributes = i.get('attributes') or {} - attachment_url = attachment_attributes.get('url') - if attachment_url: - info.update({ - 'url': attachment_url, - 'ext': determine_ext(attachment_attributes.get('name'), 'mp3'), - }) + add_file(i.get('attributes') or {}) elif i_type == 'user': user_attributes = i.get('attributes') if user_attributes: @@ -122,6 +124,9 @@ class PatreonIE(InfoExtractor): 'uploader_url': user_attributes.get('url'), }) + if not info.get('url'): + add_file(attributes.get('post_file') or {}) + if not info.get('url'): info.update({ '_type': 'url',